OSDN Git Service

Detect SSE4.1 support for Subzero.
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
21 #include "SamplerCore.hpp"
22 #include "Constants.hpp"
23 #include "Debug.hpp"
24
25 namespace sw
26 {
27         extern bool complementaryDepthBuffer;
28         extern bool postBlendSRGB;
29         extern bool exactColorRounding;
30         extern bool forceClearRegisters;
31
32         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
33         {
34                 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
35                 {
36                         for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
37                         {
38                                 v[i].x = Float4(0.0f);
39                                 v[i].y = Float4(0.0f);
40                                 v[i].z = Float4(0.0f);
41                                 v[i].w = Float4(0.0f);
42                         }
43                 }
44         }
45
46         PixelRoutine::~PixelRoutine()
47         {
48                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
49                 {
50                         delete sampler[i];
51                 }
52         }
53
54         void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
55         {
56                 #if PERF_PROFILE
57                         Long pipeTime = Ticks();
58                 #endif
59
60                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
61                 {
62                         sampler[i] = new SamplerCore(constants, state.sampler[i]);
63                 }
64
65                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
66
67                 Int zMask[4];   // Depth mask
68                 Int sMask[4];   // Stencil mask
69
70                 for(unsigned int q = 0; q < state.multiSample; q++)
71                 {
72                         zMask[q] = cMask[q];
73                         sMask[q] = cMask[q];
74                 }
75
76                 for(unsigned int q = 0; q < state.multiSample; q++)
77                 {
78                         stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
79                 }
80
81                 Float4 f;
82                 Float4 rhwCentroid;
83
84                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
85
86                 if(interpolateZ())
87                 {
88                         for(unsigned int q = 0; q < state.multiSample; q++)
89                         {
90                                 Float4 x = xxxx;
91
92                                 if(state.multiSample > 1)
93                                 {
94                                         x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
95                                 }
96
97                                 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
98                         }
99                 }
100
101                 Bool depthPass = false;
102
103                 if(earlyDepthTest)
104                 {
105                         for(unsigned int q = 0; q < state.multiSample; q++)
106                         {
107                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
108                         }
109                 }
110
111                 If(depthPass || Bool(!earlyDepthTest))
112                 {
113                         #if PERF_PROFILE
114                                 Long interpTime = Ticks();
115                         #endif
116
117                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
118
119                         // Centroid locations
120                         Float4 XXXX = Float4(0.0f);
121                         Float4 YYYY = Float4(0.0f);
122
123                         if(state.centroid)
124                         {
125                                 Float4 WWWW(1.0e-9f);
126
127                                 for(unsigned int q = 0; q < state.multiSample; q++)
128                                 {
129                                         XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
130                                         YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
131                                         WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
132                                 }
133
134                                 WWWW = Rcp_pp(WWWW);
135                                 XXXX *= WWWW;
136                                 YYYY *= WWWW;
137
138                                 XXXX += xxxx;
139                                 YYYY += yyyy;
140                         }
141
142                         if(interpolateW())
143                         {
144                                 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
145                                 rhw = reciprocal(w, false, false, true);
146
147                                 if(state.centroid)
148                                 {
149                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
150                                 }
151                         }
152
153                         for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
154                         {
155                                 for(int component = 0; component < 4; component++)
156                                 {
157                                         if(state.interpolant[interpolant].component & (1 << component))
158                                         {
159                                                 if(!state.interpolant[interpolant].centroid)
160                                                 {
161                                                         v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
162                                                 }
163                                                 else
164                                                 {
165                                                         v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
166                                                 }
167                                         }
168                                 }
169
170                                 Float4 rcp;
171
172                                 switch(state.interpolant[interpolant].project)
173                                 {
174                                 case 0:
175                                         break;
176                                 case 1:
177                                         rcp = reciprocal(v[interpolant].y);
178                                         v[interpolant].x = v[interpolant].x * rcp;
179                                         break;
180                                 case 2:
181                                         rcp = reciprocal(v[interpolant].z);
182                                         v[interpolant].x = v[interpolant].x * rcp;
183                                         v[interpolant].y = v[interpolant].y * rcp;
184                                         break;
185                                 case 3:
186                                         rcp = reciprocal(v[interpolant].w);
187                                         v[interpolant].x = v[interpolant].x * rcp;
188                                         v[interpolant].y = v[interpolant].y * rcp;
189                                         v[interpolant].z = v[interpolant].z * rcp;
190                                         break;
191                                 }
192                         }
193
194                         if(state.fog.component)
195                         {
196                                 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
197                         }
198
199                         setBuiltins(x, y, z, w);
200
201                         #if PERF_PROFILE
202                                 cycles[PERF_INTERP] += Ticks() - interpTime;
203                         #endif
204
205                         Bool alphaPass = true;
206
207                         if(colorUsed())
208                         {
209                                 #if PERF_PROFILE
210                                         Long shaderTime = Ticks();
211                                 #endif
212
213                                 applyShader(cMask);
214
215                                 #if PERF_PROFILE
216                                         cycles[PERF_SHADER] += Ticks() - shaderTime;
217                                 #endif
218
219                                 alphaPass = alphaTest(cMask);
220
221                                 if((shader && shader->containsKill()) || state.alphaTestActive())
222                                 {
223                                         for(unsigned int q = 0; q < state.multiSample; q++)
224                                         {
225                                                 zMask[q] &= cMask[q];
226                                                 sMask[q] &= cMask[q];
227                                         }
228                                 }
229                         }
230
231                         If(alphaPass)
232                         {
233                                 if(!earlyDepthTest)
234                                 {
235                                         for(unsigned int q = 0; q < state.multiSample; q++)
236                                         {
237                                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
238                                         }
239                                 }
240
241                                 #if PERF_PROFILE
242                                         Long ropTime = Ticks();
243                                 #endif
244
245                                 If(depthPass || Bool(earlyDepthTest))
246                                 {
247                                         for(unsigned int q = 0; q < state.multiSample; q++)
248                                         {
249                                                 if(state.multiSampleMask & (1 << q))
250                                                 {
251                                                         writeDepth(zBuffer, q, x, z[q], zMask[q]);
252
253                                                         if(state.occlusionEnabled)
254                                                         {
255                                                                 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
256                                                         }
257                                                 }
258                                         }
259
260                                         if(colorUsed())
261                                         {
262                                                 #if PERF_PROFILE
263                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
264                                                 #endif
265
266                                                 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
267                                         }
268                                 }
269
270                                 #if PERF_PROFILE
271                                         cycles[PERF_ROP] += Ticks() - ropTime;
272                                 #endif
273                         }
274                 }
275
276                 for(unsigned int q = 0; q < state.multiSample; q++)
277                 {
278                         if(state.multiSampleMask & (1 << q))
279                         {
280                                 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
281                         }
282                 }
283
284                 #if PERF_PROFILE
285                         cycles[PERF_PIPE] += Ticks() - pipeTime;
286                 #endif
287         }
288
289         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
290         {
291                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
292
293                 if(!flat)
294                 {
295                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
296                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
297
298                         if(perspective)
299                         {
300                                 interpolant *= rhw;
301                         }
302                 }
303
304                 return interpolant;
305         }
306
307         void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
308         {
309                 if(!state.stencilActive)
310                 {
311                         return;
312                 }
313
314                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
315
316                 Pointer<Byte> buffer = sBuffer + 2 * x;
317
318                 if(q > 0)
319                 {
320                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
321                 }
322
323                 Byte8 value = *Pointer<Byte8>(buffer);
324                 Byte8 valueCCW = value;
325
326                 if(!state.noStencilMask)
327                 {
328                         value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
329                 }
330
331                 stencilTest(value, state.stencilCompareMode, false);
332
333                 if(state.twoSidedStencil)
334                 {
335                         if(!state.noStencilMaskCCW)
336                         {
337                                 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
338                         }
339
340                         stencilTest(valueCCW, state.stencilCompareModeCCW, true);
341
342                         value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
343                         valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
344                         value |= valueCCW;
345                 }
346
347                 sMask = SignMask(value) & cMask;
348         }
349
350         void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
351         {
352                 Byte8 equal;
353
354                 switch(stencilCompareMode)
355                 {
356                 case STENCIL_ALWAYS:
357                         value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
358                         break;
359                 case STENCIL_NEVER:
360                         value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
361                         break;
362                 case STENCIL_LESS:                      // a < b ~ b > a
363                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
365                         break;
366                 case STENCIL_EQUAL:
367                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
368                         break;
369                 case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
370                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
371                         value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
372                         break;
373                 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
374                         equal = value;
375                         equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
376                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
377                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
378                         value |= equal;
379                         break;
380                 case STENCIL_GREATER:           // a > b
381                         equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
382                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
384                         value = equal;
385                         break;
386                 case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
387                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
389                         value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
390                         break;
391                 default:
392                         ASSERT(false);
393                 }
394         }
395
396         Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
397         {
398                 if(!state.depthTestActive)
399                 {
400                         return true;
401                 }
402
403                 Float4 Z = z;
404
405                 if(shader && shader->depthOverride())
406                 {
407                         if(complementaryDepthBuffer)
408                         {
409                                 Z = Float4(1.0f) - oDepth;
410                         }
411                         else
412                         {
413                                 Z = oDepth;
414                         }
415                 }
416
417                 Pointer<Byte> buffer;
418                 Int pitch;
419
420                 if(!state.quadLayoutDepthBuffer)
421                 {
422                         buffer = zBuffer + 4 * x;
423                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
424                 }
425                 else
426                 {
427                         buffer = zBuffer + 8 * x;
428                 }
429
430                 if(q > 0)
431                 {
432                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
433                 }
434
435                 Float4 zValue;
436
437                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
438                 {
439                         if(!state.quadLayoutDepthBuffer)
440                         {
441                                 // FIXME: Properly optimizes?
442                                 zValue.xy = *Pointer<Float4>(buffer);
443                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
444                         }
445                         else
446                         {
447                                 zValue = *Pointer<Float4>(buffer, 16);
448                         }
449                 }
450
451                 Int4 zTest;
452
453                 switch(state.depthCompareMode)
454                 {
455                 case DEPTH_ALWAYS:
456                         // Optimized
457                         break;
458                 case DEPTH_NEVER:
459                         // Optimized
460                         break;
461                 case DEPTH_EQUAL:
462                         zTest = CmpEQ(zValue, Z);
463                         break;
464                 case DEPTH_NOTEQUAL:
465                         zTest = CmpNEQ(zValue, Z);
466                         break;
467                 case DEPTH_LESS:
468                         if(complementaryDepthBuffer)
469                         {
470                                 zTest = CmpLT(zValue, Z);
471                         }
472                         else
473                         {
474                                 zTest = CmpNLE(zValue, Z);
475                         }
476                         break;
477                 case DEPTH_GREATEREQUAL:
478                         if(complementaryDepthBuffer)
479                         {
480                                 zTest = CmpNLT(zValue, Z);
481                         }
482                         else
483                         {
484                                 zTest = CmpLE(zValue, Z);
485                         }
486                         break;
487                 case DEPTH_LESSEQUAL:
488                         if(complementaryDepthBuffer)
489                         {
490                                 zTest = CmpLE(zValue, Z);
491                         }
492                         else
493                         {
494                                 zTest = CmpNLT(zValue, Z);
495                         }
496                         break;
497                 case DEPTH_GREATER:
498                         if(complementaryDepthBuffer)
499                         {
500                                 zTest = CmpNLE(zValue, Z);
501                         }
502                         else
503                         {
504                                 zTest = CmpLT(zValue, Z);
505                         }
506                         break;
507                 default:
508                         ASSERT(false);
509                 }
510
511                 switch(state.depthCompareMode)
512                 {
513                 case DEPTH_ALWAYS:
514                         zMask = cMask;
515                         break;
516                 case DEPTH_NEVER:
517                         zMask = 0x0;
518                         break;
519                 default:
520                         zMask = SignMask(zTest) & cMask;
521                         break;
522                 }
523
524                 if(state.stencilActive)
525                 {
526                         zMask &= sMask;
527                 }
528
529                 return zMask != 0;
530         }
531
532         void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
533         {
534                 Short4 cmp;
535                 Short4 equal;
536
537                 switch(state.alphaCompareMode)
538                 {
539                 case ALPHA_ALWAYS:
540                         aMask = 0xF;
541                         break;
542                 case ALPHA_NEVER:
543                         aMask = 0x0;
544                         break;
545                 case ALPHA_EQUAL:
546                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
547                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
548                         break;
549                 case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
550                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
551                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
552                         break;
553                 case ALPHA_LESS:           // a < b ~ b > a
554                         cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
555                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
556                         break;
557                 case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
558                         equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
559                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560                         cmp |= equal;
561                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
562                         break;
563                 case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
564                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
565                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
566                         break;
567                 case ALPHA_GREATER:        // a > b
568                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
569                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
570                         break;
571                 default:
572                         ASSERT(false);
573                 }
574         }
575
576         void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
577         {
578                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
579                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
580                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
581                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
582
583                 Int aMask0 = SignMask(coverage0);
584                 Int aMask1 = SignMask(coverage1);
585                 Int aMask2 = SignMask(coverage2);
586                 Int aMask3 = SignMask(coverage3);
587
588                 cMask[0] &= aMask0;
589                 cMask[1] &= aMask1;
590                 cMask[2] &= aMask2;
591                 cMask[3] &= aMask3;
592         }
593
594         void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
595         {
596                 if(!state.fogActive)
597                 {
598                         return;
599                 }
600
601                 if(state.pixelFogMode != FOG_NONE)
602                 {
603                         pixelFog(fog);
604
605                         fog = Min(fog, Float4(1.0f));
606                         fog = Max(fog, Float4(0.0f));
607                 }
608
609                 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610                 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611                 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
612
613                 c0.x *= fog;
614                 c0.y *= fog;
615                 c0.z *= fog;
616
617                 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
618                 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
619                 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
620         }
621
622         void PixelRoutine::pixelFog(Float4 &visibility)
623         {
624                 Float4 &zw = visibility;
625
626                 if(state.pixelFogMode != FOG_NONE)
627                 {
628                         if(state.wBasedFog)
629                         {
630                                 zw = rhw;
631                         }
632                         else
633                         {
634                                 if(complementaryDepthBuffer)
635                                 {
636                                         zw = Float4(1.0f) - z[0];
637                                 }
638                                 else
639                                 {
640                                         zw = z[0];
641                                 }
642                         }
643                 }
644
645                 switch(state.pixelFogMode)
646                 {
647                 case FOG_NONE:
648                         break;
649                 case FOG_LINEAR:
650                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
651                         zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
652                         break;
653                 case FOG_EXP:
654                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
655                         zw = exponential2(zw, true);
656                         break;
657                 case FOG_EXP2:
658                         zw *= zw;
659                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
660                         zw = exponential2(zw, true);
661                         break;
662                 default:
663                         ASSERT(false);
664                 }
665         }
666
667         void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
668         {
669                 if(!state.depthWriteEnable)
670                 {
671                         return;
672                 }
673
674                 Float4 Z = z;
675
676                 if(shader && shader->depthOverride())
677                 {
678                         if(complementaryDepthBuffer)
679                         {
680                                 Z = Float4(1.0f) - oDepth;
681                         }
682                         else
683                         {
684                                 Z = oDepth;
685                         }
686                 }
687
688                 Pointer<Byte> buffer;
689                 Int pitch;
690
691                 if(!state.quadLayoutDepthBuffer)
692                 {
693                         buffer = zBuffer + 4 * x;
694                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
695                 }
696                 else
697                 {
698                         buffer = zBuffer + 8 * x;
699                 }
700
701                 if(q > 0)
702                 {
703                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
704                 }
705
706                 Float4 zValue;
707
708                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
709                 {
710                         if(!state.quadLayoutDepthBuffer)
711                         {
712                                 // FIXME: Properly optimizes?
713                                 zValue.xy = *Pointer<Float4>(buffer);
714                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
715                         }
716                         else
717                         {
718                                 zValue = *Pointer<Float4>(buffer, 16);
719                         }
720                 }
721
722                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
723                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
724                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
725
726                 if(!state.quadLayoutDepthBuffer)
727                 {
728                         // FIXME: Properly optimizes?
729                         *Pointer<Float2>(buffer) = Float2(Z.xy);
730                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
731                 }
732                 else
733                 {
734                         *Pointer<Float4>(buffer, 16) = Z;
735                 }
736         }
737
738         void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
739         {
740                 if(!state.stencilActive)
741                 {
742                         return;
743                 }
744
745                 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
746                 {
747                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
748                         {
749                                 return;
750                         }
751                 }
752
753                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
754                 {
755                         return;
756                 }
757
758                 Pointer<Byte> buffer = sBuffer + 2 * x;
759
760                 if(q > 0)
761                 {
762                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
763                 }
764
765                 Byte8 bufferValue = *Pointer<Byte8>(buffer);
766
767                 Byte8 newValue;
768                 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
769
770                 if(!state.noStencilWriteMask)
771                 {
772                         Byte8 maskedValue = bufferValue;
773                         newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
774                         maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
775                         newValue |= maskedValue;
776                 }
777
778                 if(state.twoSidedStencil)
779                 {
780                         Byte8 newValueCCW;
781
782                         stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
783
784                         if(!state.noStencilWriteMaskCCW)
785                         {
786                                 Byte8 maskedValue = bufferValue;
787                                 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
788                                 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
789                                 newValueCCW |= maskedValue;
790                         }
791
792                         newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
793                         newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
794                         newValue |= newValueCCW;
795                 }
796
797                 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
798                 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
799                 newValue |= bufferValue;
800
801                 *Pointer<Byte4>(buffer) = Byte4(newValue);
802         }
803
804         void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
805         {
806                 Byte8 &pass = newValue;
807                 Byte8 fail;
808                 Byte8 zFail;
809
810                 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
811
812                 if(stencilZFailOperation != stencilPassOperation)
813                 {
814                         stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
815                 }
816
817                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
818                 {
819                         stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
820                 }
821
822                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
823                 {
824                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
825                         {
826                                 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
827                                 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
828                                 pass |= zFail;
829                         }
830
831                         pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
832                         fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
833                         pass |= fail;
834                 }
835         }
836
837         void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
838         {
839                 switch(operation)
840                 {
841                 case OPERATION_KEEP:
842                         output = bufferValue;
843                         break;
844                 case OPERATION_ZERO:
845                         output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
846                         break;
847                 case OPERATION_REPLACE:
848                         output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
849                         break;
850                 case OPERATION_INCRSAT:
851                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
852                         break;
853                 case OPERATION_DECRSAT:
854                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
855                         break;
856                 case OPERATION_INVERT:
857                         output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
858                         break;
859                 case OPERATION_INCR:
860                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
861                         break;
862                 case OPERATION_DECR:
863                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
864                         break;
865                 default:
866                         ASSERT(false);
867                 }
868         }
869
870         void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
871         {
872                 switch(blendFactorActive)
873                 {
874                 case BLEND_ZERO:
875                         // Optimized
876                         break;
877                 case BLEND_ONE:
878                         // Optimized
879                         break;
880                 case BLEND_SOURCE:
881                         blendFactor.x = current.x;
882                         blendFactor.y = current.y;
883                         blendFactor.z = current.z;
884                         break;
885                 case BLEND_INVSOURCE:
886                         blendFactor.x = Short4(0xFFFFu) - current.x;
887                         blendFactor.y = Short4(0xFFFFu) - current.y;
888                         blendFactor.z = Short4(0xFFFFu) - current.z;
889                         break;
890                 case BLEND_DEST:
891                         blendFactor.x = pixel.x;
892                         blendFactor.y = pixel.y;
893                         blendFactor.z = pixel.z;
894                         break;
895                 case BLEND_INVDEST:
896                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
897                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
898                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
899                         break;
900                 case BLEND_SOURCEALPHA:
901                         blendFactor.x = current.w;
902                         blendFactor.y = current.w;
903                         blendFactor.z = current.w;
904                         break;
905                 case BLEND_INVSOURCEALPHA:
906                         blendFactor.x = Short4(0xFFFFu) - current.w;
907                         blendFactor.y = Short4(0xFFFFu) - current.w;
908                         blendFactor.z = Short4(0xFFFFu) - current.w;
909                         break;
910                 case BLEND_DESTALPHA:
911                         blendFactor.x = pixel.w;
912                         blendFactor.y = pixel.w;
913                         blendFactor.z = pixel.w;
914                         break;
915                 case BLEND_INVDESTALPHA:
916                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
917                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
918                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
919                         break;
920                 case BLEND_SRCALPHASAT:
921                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
922                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
923                         blendFactor.y = blendFactor.x;
924                         blendFactor.z = blendFactor.x;
925                         break;
926                 case BLEND_CONSTANT:
927                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
928                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
929                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
930                         break;
931                 case BLEND_INVCONSTANT:
932                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
933                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
934                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
935                         break;
936                 case BLEND_CONSTANTALPHA:
937                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
938                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940                         break;
941                 case BLEND_INVCONSTANTALPHA:
942                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945                         break;
946                 default:
947                         ASSERT(false);
948                 }
949         }
950
951         void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
952         {
953                 switch(blendFactorAlphaActive)
954                 {
955                 case BLEND_ZERO:
956                         // Optimized
957                         break;
958                 case BLEND_ONE:
959                         // Optimized
960                         break;
961                 case BLEND_SOURCE:
962                         blendFactor.w = current.w;
963                         break;
964                 case BLEND_INVSOURCE:
965                         blendFactor.w = Short4(0xFFFFu) - current.w;
966                         break;
967                 case BLEND_DEST:
968                         blendFactor.w = pixel.w;
969                         break;
970                 case BLEND_INVDEST:
971                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
972                         break;
973                 case BLEND_SOURCEALPHA:
974                         blendFactor.w = current.w;
975                         break;
976                 case BLEND_INVSOURCEALPHA:
977                         blendFactor.w = Short4(0xFFFFu) - current.w;
978                         break;
979                 case BLEND_DESTALPHA:
980                         blendFactor.w = pixel.w;
981                         break;
982                 case BLEND_INVDESTALPHA:
983                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
984                         break;
985                 case BLEND_SRCALPHASAT:
986                         blendFactor.w = Short4(0xFFFFu);
987                         break;
988                 case BLEND_CONSTANT:
989                 case BLEND_CONSTANTALPHA:
990                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
991                         break;
992                 case BLEND_INVCONSTANT:
993                 case BLEND_INVCONSTANTALPHA:
994                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
995                         break;
996                 default:
997                         ASSERT(false);
998                 }
999         }
1000
1001         bool PixelRoutine::isSRGB(int index) const
1002         {
1003                 return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
1004         }
1005
1006         void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1007         {
1008                 Short4 c01;
1009                 Short4 c23;
1010                 Pointer<Byte> buffer;
1011                 Pointer<Byte> buffer2;
1012
1013                 switch(state.targetFormat[index])
1014                 {
1015                 case FORMAT_R5G6B5:
1016                         buffer = cBuffer + 2 * x;
1017                         buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1018                         c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1019
1020                         pixel.x = c01 & Short4(0xF800u);
1021                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
1022                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
1023                         pixel.w = Short4(0xFFFFu);
1024                         break;
1025                 case FORMAT_A8R8G8B8:
1026                         buffer = cBuffer + 4 * x;
1027                         c01 = *Pointer<Short4>(buffer);
1028                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1029                         c23 = *Pointer<Short4>(buffer);
1030                         pixel.z = c01;
1031                         pixel.y = c01;
1032                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1033                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1034                         pixel.x = pixel.z;
1035                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1036                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1037                         pixel.y = pixel.z;
1038                         pixel.w = pixel.x;
1039                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1040                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1041                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1042                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1043                         break;
1044                 case FORMAT_A8B8G8R8:
1045                 case FORMAT_SRGB8_A8:
1046                         buffer = cBuffer + 4 * x;
1047                         c01 = *Pointer<Short4>(buffer);
1048                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1049                         c23 = *Pointer<Short4>(buffer);
1050                         pixel.z = c01;
1051                         pixel.y = c01;
1052                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1053                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1054                         pixel.x = pixel.z;
1055                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1056                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1057                         pixel.y = pixel.z;
1058                         pixel.w = pixel.x;
1059                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1060                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1061                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063                         break;
1064                 case FORMAT_A8:
1065                         buffer = cBuffer + 1 * x;
1066                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1067                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1068                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1069                         pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1070                         pixel.x = Short4(0x0000);
1071                         pixel.y = Short4(0x0000);
1072                         pixel.z = Short4(0x0000);
1073                         break;
1074                 case FORMAT_X8R8G8B8:
1075                         buffer = cBuffer + 4 * x;
1076                         c01 = *Pointer<Short4>(buffer);
1077                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1078                         c23 = *Pointer<Short4>(buffer);
1079                         pixel.z = c01;
1080                         pixel.y = c01;
1081                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1082                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1083                         pixel.x = pixel.z;
1084                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1085                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1086                         pixel.y = pixel.z;
1087                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1088                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1089                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1090                         pixel.w = Short4(0xFFFFu);
1091                         break;
1092                 case FORMAT_X8B8G8R8:
1093                 case FORMAT_SRGB8_X8:
1094                         buffer = cBuffer + 4 * x;
1095                         c01 = *Pointer<Short4>(buffer);
1096                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1097                         c23 = *Pointer<Short4>(buffer);
1098                         pixel.z = c01;
1099                         pixel.y = c01;
1100                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1101                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1102                         pixel.x = pixel.z;
1103                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1104                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1105                         pixel.y = pixel.z;
1106                         pixel.w = pixel.x;
1107                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1108                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1109                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110                         pixel.w = Short4(0xFFFFu);
1111                         break;
1112                 case FORMAT_A8G8R8B8Q:
1113                         UNIMPLEMENTED();
1114                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1115                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1117                 //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118                         break;
1119                 case FORMAT_X8G8R8B8Q:
1120                         UNIMPLEMENTED();
1121                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1122                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1124                 //      pixel.w = Short4(0xFFFFu);
1125                         break;
1126                 case FORMAT_A16B16G16R16:
1127                         buffer = cBuffer;
1128                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1129                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1130                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1131                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1132                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1133                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1134                         break;
1135                 case FORMAT_G16R16:
1136                         buffer = cBuffer;
1137                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1138                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1139                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1140                         pixel.z = pixel.x;
1141                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1142                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1143                         pixel.y = pixel.z;
1144                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1145                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1146                         pixel.z = Short4(0xFFFFu);
1147                         pixel.w = Short4(0xFFFFu);
1148                         break;
1149                 default:
1150                         ASSERT(false);
1151                 }
1152
1153                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1154                 {
1155                         sRGBtoLinear16_12_16(pixel);
1156                 }
1157         }
1158
1159         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1160         {
1161                 if(!state.alphaBlendActive)
1162                 {
1163                         return;
1164                 }
1165
1166                 Vector4s pixel;
1167                 readPixel(index, cBuffer, x, pixel);
1168
1169                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1170                 Vector4s sourceFactor;
1171                 Vector4s destFactor;
1172
1173                 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1174                 blendFactor(destFactor, current, pixel, state.destBlendFactor);
1175
1176                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1177                 {
1178                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1179                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1180                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1181                 }
1182
1183                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1184                 {
1185                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1186                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1187                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1188                 }
1189
1190                 switch(state.blendOperation)
1191                 {
1192                 case BLENDOP_ADD:
1193                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1194                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1195                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1196                         break;
1197                 case BLENDOP_SUB:
1198                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1199                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1200                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1201                         break;
1202                 case BLENDOP_INVSUB:
1203                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1204                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1205                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1206                         break;
1207                 case BLENDOP_MIN:
1208                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1209                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1210                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1211                         break;
1212                 case BLENDOP_MAX:
1213                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1214                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1215                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1216                         break;
1217                 case BLENDOP_SOURCE:
1218                         // No operation
1219                         break;
1220                 case BLENDOP_DEST:
1221                         current.x = pixel.x;
1222                         current.y = pixel.y;
1223                         current.z = pixel.z;
1224                         break;
1225                 case BLENDOP_NULL:
1226                         current.x = Short4(0x0000);
1227                         current.y = Short4(0x0000);
1228                         current.z = Short4(0x0000);
1229                         break;
1230                 default:
1231                         ASSERT(false);
1232                 }
1233
1234                 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1235                 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1236
1237                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1238                 {
1239                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1240                 }
1241
1242                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1243                 {
1244                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1245                 }
1246
1247                 switch(state.blendOperationAlpha)
1248                 {
1249                 case BLENDOP_ADD:
1250                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1251                         break;
1252                 case BLENDOP_SUB:
1253                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1254                         break;
1255                 case BLENDOP_INVSUB:
1256                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1257                         break;
1258                 case BLENDOP_MIN:
1259                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1260                         break;
1261                 case BLENDOP_MAX:
1262                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263                         break;
1264                 case BLENDOP_SOURCE:
1265                         // No operation
1266                         break;
1267                 case BLENDOP_DEST:
1268                         current.w = pixel.w;
1269                         break;
1270                 case BLENDOP_NULL:
1271                         current.w = Short4(0x0000);
1272                         break;
1273                 default:
1274                         ASSERT(false);
1275                 }
1276         }
1277
1278         void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1279         {
1280                 if(state.logicalOperation == LOGICALOP_COPY)
1281                 {
1282                         return;
1283                 }
1284
1285                 Vector4s pixel;
1286                 readPixel(index, cBuffer, x, pixel);
1287
1288                 switch(state.logicalOperation)
1289                 {
1290                 case LOGICALOP_CLEAR:
1291                         current.x = UShort4(0);
1292                         current.y = UShort4(0);
1293                         current.z = UShort4(0);
1294                         break;
1295                 case LOGICALOP_SET:
1296                         current.x = UShort4(0xFFFFu);
1297                         current.y = UShort4(0xFFFFu);
1298                         current.z = UShort4(0xFFFFu);
1299                         break;
1300                 case LOGICALOP_COPY:
1301                         ASSERT(false);   // Optimized out
1302                         break;
1303                 case LOGICALOP_COPY_INVERTED:
1304                         current.x = ~current.x;
1305                         current.y = ~current.y;
1306                         current.z = ~current.z;
1307                         break;
1308                 case LOGICALOP_NOOP:
1309                         current.x = pixel.x;
1310                         current.y = pixel.y;
1311                         current.z = pixel.z;
1312                         break;
1313                 case LOGICALOP_INVERT:
1314                         current.x = ~pixel.x;
1315                         current.y = ~pixel.y;
1316                         current.z = ~pixel.z;
1317                         break;
1318                 case LOGICALOP_AND:
1319                         current.x = pixel.x & current.x;
1320                         current.y = pixel.y & current.y;
1321                         current.z = pixel.z & current.z;
1322                         break;
1323                 case LOGICALOP_NAND:
1324                         current.x = ~(pixel.x & current.x);
1325                         current.y = ~(pixel.y & current.y);
1326                         current.z = ~(pixel.z & current.z);
1327                         break;
1328                 case LOGICALOP_OR:
1329                         current.x = pixel.x | current.x;
1330                         current.y = pixel.y | current.y;
1331                         current.z = pixel.z | current.z;
1332                         break;
1333                 case LOGICALOP_NOR:
1334                         current.x = ~(pixel.x | current.x);
1335                         current.y = ~(pixel.y | current.y);
1336                         current.z = ~(pixel.z | current.z);
1337                         break;
1338                 case LOGICALOP_XOR:
1339                         current.x = pixel.x ^ current.x;
1340                         current.y = pixel.y ^ current.y;
1341                         current.z = pixel.z ^ current.z;
1342                         break;
1343                 case LOGICALOP_EQUIV:
1344                         current.x = ~(pixel.x ^ current.x);
1345                         current.y = ~(pixel.y ^ current.y);
1346                         current.z = ~(pixel.z ^ current.z);
1347                         break;
1348                 case LOGICALOP_AND_REVERSE:
1349                         current.x = ~pixel.x & current.x;
1350                         current.y = ~pixel.y & current.y;
1351                         current.z = ~pixel.z & current.z;
1352                         break;
1353                 case LOGICALOP_AND_INVERTED:
1354                         current.x = pixel.x & ~current.x;
1355                         current.y = pixel.y & ~current.y;
1356                         current.z = pixel.z & ~current.z;
1357                         break;
1358                 case LOGICALOP_OR_REVERSE:
1359                         current.x = ~pixel.x | current.x;
1360                         current.y = ~pixel.y | current.y;
1361                         current.z = ~pixel.z | current.z;
1362                         break;
1363                 case LOGICALOP_OR_INVERTED:
1364                         current.x = pixel.x | ~current.x;
1365                         current.y = pixel.y | ~current.y;
1366                         current.z = pixel.z | ~current.z;
1367                         break;
1368                 default:
1369                         ASSERT(false);
1370                 }
1371         }
1372
1373         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1374         {
1375                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1376                 {
1377                         linearToSRGB16_12_16(current);
1378                 }
1379
1380                 if(exactColorRounding)
1381                 {
1382                         switch(state.targetFormat[index])
1383                         {
1384                         case FORMAT_R5G6B5:
1385                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1386                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1387                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1388                                 break;
1389                         case FORMAT_X8G8R8B8Q:
1390                         case FORMAT_A8G8R8B8Q:
1391                         case FORMAT_X8R8G8B8:
1392                         case FORMAT_X8B8G8R8:
1393                         case FORMAT_A8R8G8B8:
1394                         case FORMAT_A8B8G8R8:
1395                         case FORMAT_SRGB8_X8:
1396                         case FORMAT_SRGB8_A8:
1397                         case FORMAT_G8R8:
1398                         case FORMAT_R8:
1399                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1400                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1401                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1402                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1403                                 break;
1404                         default:
1405                                 break;
1406                         }
1407                 }
1408
1409                 int rgbaWriteMask = state.colorWriteActive(index);
1410                 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1411
1412                 switch(state.targetFormat[index])
1413                 {
1414                 case FORMAT_R5G6B5:
1415                         {
1416                                 current.x = current.x & Short4(0xF800u);
1417                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1418                                 current.z = As<UShort4>(current.z) >> 11;
1419
1420                                 current.x = current.x | current.y | current.z;
1421                         }
1422                         break;
1423                 case FORMAT_X8G8R8B8Q:
1424                         UNIMPLEMENTED();
1425                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1426                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1427                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1428
1429                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1430                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1431                         break;
1432                 case FORMAT_A8G8R8B8Q:
1433                         UNIMPLEMENTED();
1434                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437                 //      current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1438
1439                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1440                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1441                         break;
1442                 case FORMAT_X8R8G8B8:
1443                 case FORMAT_A8R8G8B8:
1444                         if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1445                         {
1446                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449
1450                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1452
1453                                 current.x = current.z;
1454                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1455                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1456                                 current.y = current.z;
1457                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1458                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1459                         }
1460                         else
1461                         {
1462                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1463                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1464                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1465                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1466
1467                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1468                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1469
1470                                 current.x = current.z;
1471                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1472                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1473                                 current.y = current.z;
1474                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1475                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1476                         }
1477                         break;
1478                 case FORMAT_X8B8G8R8:
1479                 case FORMAT_A8B8G8R8:
1480                 case FORMAT_SRGB8_X8:
1481                 case FORMAT_SRGB8_A8:
1482                         if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1483                         {
1484                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1485                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1486                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1487
1488                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1489                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1490
1491                                 current.x = current.z;
1492                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1493                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1494                                 current.y = current.z;
1495                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1496                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1497                         }
1498                         else
1499                         {
1500                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1501                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1502                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1503                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504
1505                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1506                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1507
1508                                 current.x = current.z;
1509                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1510                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1511                                 current.y = current.z;
1512                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1513                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1514                         }
1515                         break;
1516                 case FORMAT_G8R8:
1517                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1518                         current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1519                         current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1520                         current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1521                         current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1522                         break;
1523                 case FORMAT_R8:
1524                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1525                         current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1526                         break;
1527                 case FORMAT_A8:
1528                         current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1529                         current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1530                         break;
1531                 case FORMAT_G16R16:
1532                         current.z = current.x;
1533                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1534                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1535                         current.y = current.z;
1536                         break;
1537                 case FORMAT_A16B16G16R16:
1538                         transpose4x4(current.x, current.y, current.z, current.w);
1539                         break;
1540                 default:
1541                         ASSERT(false);
1542                 }
1543
1544                 Short4 c01 = current.z;
1545                 Short4 c23 = current.y;
1546
1547                 Int xMask;   // Combination of all masks
1548
1549                 if(state.depthTestActive)
1550                 {
1551                         xMask = zMask;
1552                 }
1553                 else
1554                 {
1555                         xMask = cMask;
1556                 }
1557
1558                 if(state.stencilActive)
1559                 {
1560                         xMask &= sMask;
1561                 }
1562
1563                 switch(state.targetFormat[index])
1564                 {
1565                 case FORMAT_R5G6B5:
1566                         {
1567                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1568                                 Int value = *Pointer<Int>(buffer);
1569
1570                                 Int c01 = Extract(As<Int2>(current.x), 0);
1571
1572                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1573                                 {
1574                                         Int masked = value;
1575                                         c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1576                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1577                                         c01 |= masked;
1578                                 }
1579
1580                                 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1581                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1582                                 c01 |= value;
1583                                 *Pointer<Int>(buffer) = c01;
1584
1585                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1586                                 value = *Pointer<Int>(buffer);
1587
1588                                 Int c23 = Extract(As<Int2>(current.x), 1);
1589
1590                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1591                                 {
1592                                         Int masked = value;
1593                                         c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1594                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1595                                         c23 |= masked;
1596                                 }
1597
1598                                 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1599                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1600                                 c23 |= value;
1601                                 *Pointer<Int>(buffer) = c23;
1602                         }
1603                         break;
1604                 case FORMAT_A8G8R8B8Q:
1605                 case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1606                         UNIMPLEMENTED();
1607                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1608
1609                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1610                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1611                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1612                 //      {
1613                 //              Short4 masked = value;
1614                 //              c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1615                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1616                 //              c01 |= masked;
1617                 //      }
1618
1619                 //      c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1620                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1621                 //      c01 |= value;
1622                 //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1623
1624                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1625
1626                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1627                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1628                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1629                 //      {
1630                 //              Short4 masked = value;
1631                 //              c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1632                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1633                 //              c23 |= masked;
1634                 //      }
1635
1636                 //      c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1637                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1638                 //      c23 |= value;
1639                 //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1640                         break;
1641                 case FORMAT_A8R8G8B8:
1642                 case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1643                         {
1644                                 Pointer<Byte> buffer = cBuffer + x * 4;
1645                                 Short4 value = *Pointer<Short4>(buffer);
1646
1647                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1648                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1649                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1650                                 {
1651                                         Short4 masked = value;
1652                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1653                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1654                                         c01 |= masked;
1655                                 }
1656
1657                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1658                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1659                                 c01 |= value;
1660                                 *Pointer<Short4>(buffer) = c01;
1661
1662                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1663                                 value = *Pointer<Short4>(buffer);
1664
1665                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1666                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1667                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1668                                 {
1669                                         Short4 masked = value;
1670                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1671                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1672                                         c23 |= masked;
1673                                 }
1674
1675                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1676                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1677                                 c23 |= value;
1678                                 *Pointer<Short4>(buffer) = c23;
1679                         }
1680                         break;
1681                 case FORMAT_A8B8G8R8:
1682                 case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1683                 case FORMAT_SRGB8_X8:
1684                 case FORMAT_SRGB8_A8:
1685                         {
1686                                 Pointer<Byte> buffer = cBuffer + x * 4;
1687                                 Short4 value = *Pointer<Short4>(buffer);
1688
1689                                 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1690                                               (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1691                                                ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1692
1693                                 if(masked)
1694                                 {
1695                                         Short4 masked = value;
1696                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1698                                         c01 |= masked;
1699                                 }
1700
1701                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1702                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1703                                 c01 |= value;
1704                                 *Pointer<Short4>(buffer) = c01;
1705
1706                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1707                                 value = *Pointer<Short4>(buffer);
1708
1709                                 if(masked)
1710                                 {
1711                                         Short4 masked = value;
1712                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1713                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1714                                         c23 |= masked;
1715                                 }
1716
1717                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1718                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1719                                 c23 |= value;
1720                                 *Pointer<Short4>(buffer) = c23;
1721                         }
1722                         break;
1723                 case FORMAT_G8R8:
1724                         if((rgbaWriteMask & 0x00000003) != 0x0)
1725                         {
1726                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1727                                 Int2 value;
1728                                 value = Insert(value, *Pointer<Int>(buffer), 0);
1729                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1730                                 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1731
1732                                 Int2 packedCol = As<Int2>(current.x);
1733
1734                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1735                                 if((rgbaWriteMask & 0x3) != 0x3)
1736                                 {
1737                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1738                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1739                                         mergedMask &= rgbaMask;
1740                                 }
1741
1742                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1743
1744                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1745                                 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1746                         }
1747                         break;
1748                 case FORMAT_R8:
1749                         if(rgbaWriteMask & 0x00000001)
1750                         {
1751                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1752                                 Short4 value;
1753                                 value = Insert(value, *Pointer<Short>(buffer), 0);
1754                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1755                                 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1756                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1757
1758                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1759                                 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1760                                 current.x |= value;
1761
1762                                 *Pointer<Short>(buffer) = Extract(current.x, 0);
1763                                 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1764                         }
1765                         break;
1766                 case FORMAT_A8:
1767                         if(rgbaWriteMask & 0x00000008)
1768                         {
1769                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1770                                 Short4 value;
1771                                 value = Insert(value, *Pointer<Short>(buffer), 0);
1772                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1773                                 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1774                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1775
1776                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1777                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1778                                 current.w |= value;
1779
1780                                 *Pointer<Short>(buffer) = Extract(current.w, 0);
1781                                 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1782                         }
1783                         break;
1784                 case FORMAT_G16R16:
1785                         {
1786                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1787
1788                                 Short4 value = *Pointer<Short4>(buffer);
1789
1790                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1791                                 {
1792                                         Short4 masked = value;
1793                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1794                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1795                                         current.x |= masked;
1796                                 }
1797
1798                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1799                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1800                                 current.x |= value;
1801                                 *Pointer<Short4>(buffer) = current.x;
1802
1803                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1804
1805                                 value = *Pointer<Short4>(buffer);
1806
1807                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1808                                 {
1809                                         Short4 masked = value;
1810                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1812                                         current.y |= masked;
1813                                 }
1814
1815                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1816                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1817                                 current.y |= value;
1818                                 *Pointer<Short4>(buffer) = current.y;
1819                         }
1820                         break;
1821                 case FORMAT_A16B16G16R16:
1822                         {
1823                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1824
1825                                 {
1826                                         Short4 value = *Pointer<Short4>(buffer);
1827
1828                                         if(rgbaWriteMask != 0x0000000F)
1829                                         {
1830                                                 Short4 masked = value;
1831                                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1832                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1833                                                 current.x |= masked;
1834                                         }
1835
1836                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1837                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1838                                         current.x |= value;
1839                                         *Pointer<Short4>(buffer) = current.x;
1840                                 }
1841
1842                                 {
1843                                         Short4 value = *Pointer<Short4>(buffer + 8);
1844
1845                                         if(rgbaWriteMask != 0x0000000F)
1846                                         {
1847                                                 Short4 masked = value;
1848                                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850                                                 current.y |= masked;
1851                                         }
1852
1853                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1854                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1855                                         current.y |= value;
1856                                         *Pointer<Short4>(buffer + 8) = current.y;
1857                                 }
1858
1859                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1860
1861                                 {
1862                                         Short4 value = *Pointer<Short4>(buffer);
1863
1864                                         if(rgbaWriteMask != 0x0000000F)
1865                                         {
1866                                                 Short4 masked = value;
1867                                                 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1868                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1869                                                 current.z |= masked;
1870                                         }
1871
1872                                         current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1873                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1874                                         current.z |= value;
1875                                         *Pointer<Short4>(buffer) = current.z;
1876                                 }
1877
1878                                 {
1879                                         Short4 value = *Pointer<Short4>(buffer + 8);
1880
1881                                         if(rgbaWriteMask != 0x0000000F)
1882                                         {
1883                                                 Short4 masked = value;
1884                                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886                                                 current.w |= masked;
1887                                         }
1888
1889                                         current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1890                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1891                                         current.w |= value;
1892                                         *Pointer<Short4>(buffer + 8) = current.w;
1893                                 }
1894                         }
1895                         break;
1896                 default:
1897                         ASSERT(false);
1898                 }
1899         }
1900
1901         void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1902         {
1903                 switch(blendFactorActive)
1904                 {
1905                 case BLEND_ZERO:
1906                         // Optimized
1907                         break;
1908                 case BLEND_ONE:
1909                         // Optimized
1910                         break;
1911                 case BLEND_SOURCE:
1912                         blendFactor.x = oC.x;
1913                         blendFactor.y = oC.y;
1914                         blendFactor.z = oC.z;
1915                         break;
1916                 case BLEND_INVSOURCE:
1917                         blendFactor.x = Float4(1.0f) - oC.x;
1918                         blendFactor.y = Float4(1.0f) - oC.y;
1919                         blendFactor.z = Float4(1.0f) - oC.z;
1920                         break;
1921                 case BLEND_DEST:
1922                         blendFactor.x = pixel.x;
1923                         blendFactor.y = pixel.y;
1924                         blendFactor.z = pixel.z;
1925                         break;
1926                 case BLEND_INVDEST:
1927                         blendFactor.x = Float4(1.0f) - pixel.x;
1928                         blendFactor.y = Float4(1.0f) - pixel.y;
1929                         blendFactor.z = Float4(1.0f) - pixel.z;
1930                         break;
1931                 case BLEND_SOURCEALPHA:
1932                         blendFactor.x = oC.w;
1933                         blendFactor.y = oC.w;
1934                         blendFactor.z = oC.w;
1935                         break;
1936                 case BLEND_INVSOURCEALPHA:
1937                         blendFactor.x = Float4(1.0f) - oC.w;
1938                         blendFactor.y = Float4(1.0f) - oC.w;
1939                         blendFactor.z = Float4(1.0f) - oC.w;
1940                         break;
1941                 case BLEND_DESTALPHA:
1942                         blendFactor.x = pixel.w;
1943                         blendFactor.y = pixel.w;
1944                         blendFactor.z = pixel.w;
1945                         break;
1946                 case BLEND_INVDESTALPHA:
1947                         blendFactor.x = Float4(1.0f) - pixel.w;
1948                         blendFactor.y = Float4(1.0f) - pixel.w;
1949                         blendFactor.z = Float4(1.0f) - pixel.w;
1950                         break;
1951                 case BLEND_SRCALPHASAT:
1952                         blendFactor.x = Float4(1.0f) - pixel.w;
1953                         blendFactor.x = Min(blendFactor.x, oC.w);
1954                         blendFactor.y = blendFactor.x;
1955                         blendFactor.z = blendFactor.x;
1956                         break;
1957                 case BLEND_CONSTANT:
1958                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1959                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1960                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1961                         break;
1962                 case BLEND_INVCONSTANT:
1963                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1964                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1965                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1966                         break;
1967                 default:
1968                         ASSERT(false);
1969                 }
1970         }
1971
1972         void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1973         {
1974                 switch(blendFactorAlphaActive)
1975                 {
1976                 case BLEND_ZERO:
1977                         // Optimized
1978                         break;
1979                 case BLEND_ONE:
1980                         // Optimized
1981                         break;
1982                 case BLEND_SOURCE:
1983                         blendFactor.w = oC.w;
1984                         break;
1985                 case BLEND_INVSOURCE:
1986                         blendFactor.w = Float4(1.0f) - oC.w;
1987                         break;
1988                 case BLEND_DEST:
1989                         blendFactor.w = pixel.w;
1990                         break;
1991                 case BLEND_INVDEST:
1992                         blendFactor.w = Float4(1.0f) - pixel.w;
1993                         break;
1994                 case BLEND_SOURCEALPHA:
1995                         blendFactor.w = oC.w;
1996                         break;
1997                 case BLEND_INVSOURCEALPHA:
1998                         blendFactor.w = Float4(1.0f) - oC.w;
1999                         break;
2000                 case BLEND_DESTALPHA:
2001                         blendFactor.w = pixel.w;
2002                         break;
2003                 case BLEND_INVDESTALPHA:
2004                         blendFactor.w = Float4(1.0f) - pixel.w;
2005                         break;
2006                 case BLEND_SRCALPHASAT:
2007                         blendFactor.w = Float4(1.0f);
2008                         break;
2009                 case BLEND_CONSTANT:
2010                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2011                         break;
2012                 case BLEND_INVCONSTANT:
2013                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2014                         break;
2015                 default:
2016                         ASSERT(false);
2017                 }
2018         }
2019
2020         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2021         {
2022                 if(!state.alphaBlendActive)
2023                 {
2024                         return;
2025                 }
2026
2027                 Pointer<Byte> buffer;
2028                 Vector4f pixel;
2029
2030                 Vector4s color;
2031                 Short4 c01;
2032                 Short4 c23;
2033
2034                 Float4 one;
2035                 if(Surface::isFloatFormat(state.targetFormat[index]))
2036                 {
2037                         one = Float4(1.0f);
2038                 }
2039                 else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2040                 {
2041                         one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2042                 }
2043
2044                 switch(state.targetFormat[index])
2045                 {
2046                 case FORMAT_R32I:
2047                 case FORMAT_R32UI:
2048                 case FORMAT_R32F:
2049                         buffer = cBuffer;
2050                         // FIXME: movlps
2051                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2052                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2053                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2054                         // FIXME: movhps
2055                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2056                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2057                         pixel.y = pixel.z = pixel.w = one;
2058                         break;
2059                 case FORMAT_G32R32I:
2060                 case FORMAT_G32R32UI:
2061                 case FORMAT_G32R32F:
2062                         buffer = cBuffer;
2063                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2064                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2065                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2066                         pixel.z = pixel.x;
2067                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2068                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2069                         pixel.y = pixel.z;
2070                         pixel.z = pixel.w = one;
2071                         break;
2072                 case FORMAT_X32B32G32R32F:
2073                 case FORMAT_A32B32G32R32F:
2074                 case FORMAT_A32B32G32R32I:
2075                 case FORMAT_A32B32G32R32UI:
2076                         buffer = cBuffer;
2077                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2078                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2079                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2080                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2081                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2082                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2083                         if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2084                         {
2085                                 pixel.w = Float4(1.0f);
2086                         }
2087                         break;
2088                 default:
2089                         ASSERT(false);
2090                 }
2091
2092                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2093                 {
2094                         sRGBtoLinear(pixel.x);
2095                         sRGBtoLinear(pixel.y);
2096                         sRGBtoLinear(pixel.z);
2097                 }
2098
2099                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2100                 Vector4f sourceFactor;
2101                 Vector4f destFactor;
2102
2103                 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2104                 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2105
2106                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2107                 {
2108                         oC.x *= sourceFactor.x;
2109                         oC.y *= sourceFactor.y;
2110                         oC.z *= sourceFactor.z;
2111                 }
2112
2113                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2114                 {
2115                         pixel.x *= destFactor.x;
2116                         pixel.y *= destFactor.y;
2117                         pixel.z *= destFactor.z;
2118                 }
2119
2120                 switch(state.blendOperation)
2121                 {
2122                 case BLENDOP_ADD:
2123                         oC.x += pixel.x;
2124                         oC.y += pixel.y;
2125                         oC.z += pixel.z;
2126                         break;
2127                 case BLENDOP_SUB:
2128                         oC.x -= pixel.x;
2129                         oC.y -= pixel.y;
2130                         oC.z -= pixel.z;
2131                         break;
2132                 case BLENDOP_INVSUB:
2133                         oC.x = pixel.x - oC.x;
2134                         oC.y = pixel.y - oC.y;
2135                         oC.z = pixel.z - oC.z;
2136                         break;
2137                 case BLENDOP_MIN:
2138                         oC.x = Min(oC.x, pixel.x);
2139                         oC.y = Min(oC.y, pixel.y);
2140                         oC.z = Min(oC.z, pixel.z);
2141                         break;
2142                 case BLENDOP_MAX:
2143                         oC.x = Max(oC.x, pixel.x);
2144                         oC.y = Max(oC.y, pixel.y);
2145                         oC.z = Max(oC.z, pixel.z);
2146                         break;
2147                 case BLENDOP_SOURCE:
2148                         // No operation
2149                         break;
2150                 case BLENDOP_DEST:
2151                         oC.x = pixel.x;
2152                         oC.y = pixel.y;
2153                         oC.z = pixel.z;
2154                         break;
2155                 case BLENDOP_NULL:
2156                         oC.x = Float4(0.0f);
2157                         oC.y = Float4(0.0f);
2158                         oC.z = Float4(0.0f);
2159                         break;
2160                 default:
2161                         ASSERT(false);
2162                 }
2163
2164                 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2165                 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2166
2167                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2168                 {
2169                         oC.w *= sourceFactor.w;
2170                 }
2171
2172                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2173                 {
2174                         pixel.w *= destFactor.w;
2175                 }
2176
2177                 switch(state.blendOperationAlpha)
2178                 {
2179                 case BLENDOP_ADD:
2180                         oC.w += pixel.w;
2181                         break;
2182                 case BLENDOP_SUB:
2183                         oC.w -= pixel.w;
2184                         break;
2185                 case BLENDOP_INVSUB:
2186                         pixel.w -= oC.w;
2187                         oC.w = pixel.w;
2188                         break;
2189                 case BLENDOP_MIN:
2190                         oC.w = Min(oC.w, pixel.w);
2191                         break;
2192                 case BLENDOP_MAX:
2193                         oC.w = Max(oC.w, pixel.w);
2194                         break;
2195                 case BLENDOP_SOURCE:
2196                         // No operation
2197                         break;
2198                 case BLENDOP_DEST:
2199                         oC.w = pixel.w;
2200                         break;
2201                 case BLENDOP_NULL:
2202                         oC.w = Float4(0.0f);
2203                         break;
2204                 default:
2205                         ASSERT(false);
2206                 }
2207         }
2208
2209         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2210         {
2211                 switch(state.targetFormat[index])
2212                 {
2213                 case FORMAT_R32F:
2214                 case FORMAT_R32I:
2215                 case FORMAT_R32UI:
2216                 case FORMAT_R16I:
2217                 case FORMAT_R16UI:
2218                 case FORMAT_R8I:
2219                 case FORMAT_R8UI:
2220                         break;
2221                 case FORMAT_G32R32F:
2222                 case FORMAT_G32R32I:
2223                 case FORMAT_G32R32UI:
2224                 case FORMAT_G16R16I:
2225                 case FORMAT_G16R16UI:
2226                 case FORMAT_G8R8I:
2227                 case FORMAT_G8R8UI:
2228                         oC.z = oC.x;
2229                         oC.x = UnpackLow(oC.x, oC.y);
2230                         oC.z = UnpackHigh(oC.z, oC.y);
2231                         oC.y = oC.z;
2232                         break;
2233                 case FORMAT_X32B32G32R32F:
2234                 case FORMAT_A32B32G32R32F:
2235                 case FORMAT_A32B32G32R32I:
2236                 case FORMAT_A32B32G32R32UI:
2237                 case FORMAT_A16B16G16R16I:
2238                 case FORMAT_A16B16G16R16UI:
2239                 case FORMAT_A8B8G8R8I:
2240                 case FORMAT_A8B8G8R8UI:
2241                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2242                         break;
2243                 default:
2244                         ASSERT(false);
2245                 }
2246
2247                 int rgbaWriteMask = state.colorWriteActive(index);
2248
2249                 Int xMask;   // Combination of all masks
2250
2251                 if(state.depthTestActive)
2252                 {
2253                         xMask = zMask;
2254                 }
2255                 else
2256                 {
2257                         xMask = cMask;
2258                 }
2259
2260                 if(state.stencilActive)
2261                 {
2262                         xMask &= sMask;
2263                 }
2264
2265                 Pointer<Byte> buffer;
2266                 Float4 value;
2267
2268                 switch(state.targetFormat[index])
2269                 {
2270                 case FORMAT_R32F:
2271                 case FORMAT_R32I:
2272                 case FORMAT_R32UI:
2273                         if(rgbaWriteMask & 0x00000001)
2274                         {
2275                                 buffer = cBuffer + 4 * x;
2276
2277                                 // FIXME: movlps
2278                                 value.x = *Pointer<Float>(buffer + 0);
2279                                 value.y = *Pointer<Float>(buffer + 4);
2280
2281                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2282
2283                                 // FIXME: movhps
2284                                 value.z = *Pointer<Float>(buffer + 0);
2285                                 value.w = *Pointer<Float>(buffer + 4);
2286
2287                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2288                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2289                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2290
2291                                 // FIXME: movhps
2292                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2293                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2294
2295                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2296
2297                                 // FIXME: movlps
2298                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2299                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2300                         }
2301                         break;
2302                 case FORMAT_R16I:
2303                 case FORMAT_R16UI:
2304                         if(rgbaWriteMask & 0x00000001)
2305                         {
2306                                 buffer = cBuffer + 2 * x;
2307
2308                                 UShort4 xyzw;
2309                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2310
2311                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2312
2313                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2314                                 value = As<Float4>(Int4(xyzw));
2315
2316                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2317                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2318                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2319
2320                                 if(state.targetFormat[index] == FORMAT_R16I)
2321                                 {
2322                                         Float component = oC.x.z;
2323                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2324                                         component = oC.x.w;
2325                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2326
2327                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2328
2329                                         component = oC.x.x;
2330                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2331                                         component = oC.x.y;
2332                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2333                                 }
2334                                 else // FORMAT_R16UI
2335                                 {
2336                                         Float component = oC.x.z;
2337                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2338                                         component = oC.x.w;
2339                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2340
2341                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2342
2343                                         component = oC.x.x;
2344                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2345                                         component = oC.x.y;
2346                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2347                                 }
2348                         }
2349                         break;
2350                 case FORMAT_R8I:
2351                 case FORMAT_R8UI:
2352                         if(rgbaWriteMask & 0x00000001)
2353                         {
2354                                 buffer = cBuffer + x;
2355
2356                                 UInt xyzw, packedCol;
2357
2358                                 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2359                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2360                                 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2361
2362                                 Short4 tmpCol = Short4(As<Int4>(oC.x));
2363                                 if(state.targetFormat[index] == FORMAT_R8I)
2364                                 {
2365                                         tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2366                                 }
2367                                 else
2368                                 {
2369                                         tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2370                                 }
2371                                 packedCol = Extract(As<Int2>(tmpCol), 0);
2372
2373                                 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2374                                             (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2375
2376                                 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2377                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2378                                 *Pointer<UShort>(buffer) = UShort(packedCol);
2379                         }
2380                         break;
2381                 case FORMAT_G32R32F:
2382                 case FORMAT_G32R32I:
2383                 case FORMAT_G32R32UI:
2384                         buffer = cBuffer + 8 * x;
2385
2386                         value = *Pointer<Float4>(buffer);
2387
2388                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2389                         {
2390                                 Float4 masked = value;
2391                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2392                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2393                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2394                         }
2395
2396                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2397                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2398                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2399                         *Pointer<Float4>(buffer) = oC.x;
2400
2401                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2402
2403                         value = *Pointer<Float4>(buffer);
2404
2405                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2406                         {
2407                                 Float4 masked;
2408
2409                                 masked = value;
2410                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2411                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2412                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2413                         }
2414
2415                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2416                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2417                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2418                         *Pointer<Float4>(buffer) = oC.y;
2419                         break;
2420                 case FORMAT_G16R16I:
2421                 case FORMAT_G16R16UI:
2422                         if((rgbaWriteMask & 0x00000003) != 0x0)
2423                         {
2424                                 buffer = cBuffer + 4 * x;
2425
2426                                 UInt2 rgbaMask;
2427                                 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2428                                 UShort4 value = *Pointer<UShort4>(buffer);
2429                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2430                                 if((rgbaWriteMask & 0x3) != 0x3)
2431                                 {
2432                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2433                                         rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2434                                         mergedMask &= rgbaMask;
2435                                 }
2436                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2437
2438                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2439
2440                                 packedCol = UShort4(As<Int4>(oC.y));
2441                                 value = *Pointer<UShort4>(buffer);
2442                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2443                                 if((rgbaWriteMask & 0x3) != 0x3)
2444                                 {
2445                                         mergedMask &= rgbaMask;
2446                                 }
2447                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2448                         }
2449                         break;
2450                 case FORMAT_G8R8I:
2451                 case FORMAT_G8R8UI:
2452                         if((rgbaWriteMask & 0x00000003) != 0x0)
2453                         {
2454                                 buffer = cBuffer + 2 * x;
2455
2456                                 Int2 xyzw, packedCol;
2457
2458                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2459                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2460                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2461
2462                                 if(state.targetFormat[index] == FORMAT_G8R8I)
2463                                 {
2464                                         packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2465                                 }
2466                                 else
2467                                 {
2468                                         packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2469                                 }
2470
2471                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2472                                 if((rgbaWriteMask & 0x3) != 0x3)
2473                                 {
2474                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2475                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2476                                         mergedMask &= rgbaMask;
2477                                 }
2478
2479                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2480
2481                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2482                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2483                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2484                         }
2485                         break;
2486                 case FORMAT_X32B32G32R32F:
2487                 case FORMAT_A32B32G32R32F:
2488                 case FORMAT_A32B32G32R32I:
2489                 case FORMAT_A32B32G32R32UI:
2490                         buffer = cBuffer + 16 * x;
2491
2492                         {
2493                                 value = *Pointer<Float4>(buffer, 16);
2494
2495                                 if(rgbaWriteMask != 0x0000000F)
2496                                 {
2497                                         Float4 masked = value;
2498                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2499                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2500                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2501                                 }
2502
2503                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2504                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2505                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2506                                 *Pointer<Float4>(buffer, 16) = oC.x;
2507                         }
2508
2509                         {
2510                                 value = *Pointer<Float4>(buffer + 16, 16);
2511
2512                                 if(rgbaWriteMask != 0x0000000F)
2513                                 {
2514                                         Float4 masked = value;
2515                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2516                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2517                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2518                                 }
2519
2520                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2521                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2522                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2523                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2524                         }
2525
2526                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2527
2528                         {
2529                                 value = *Pointer<Float4>(buffer, 16);
2530
2531                                 if(rgbaWriteMask != 0x0000000F)
2532                                 {
2533                                         Float4 masked = value;
2534                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2535                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2536                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2537                                 }
2538
2539                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2540                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2541                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2542                                 *Pointer<Float4>(buffer, 16) = oC.z;
2543                         }
2544
2545                         {
2546                                 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2547
2548                                 if(rgbaWriteMask != 0x0000000F)
2549                                 {
2550                                         Float4 masked = value;
2551                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2552                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2553                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2554                                 }
2555
2556                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2557                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2558                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2559                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2560                         }
2561                         break;
2562                 case FORMAT_A16B16G16R16I:
2563                 case FORMAT_A16B16G16R16UI:
2564                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2565                         {
2566                                 buffer = cBuffer + 8 * x;
2567
2568                                 UInt4 rgbaMask;
2569                                 UShort8 value = *Pointer<UShort8>(buffer);
2570                                 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2571                                 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2572                                 if((rgbaWriteMask & 0xF) != 0xF)
2573                                 {
2574                                         UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2575                                         rgbaMask = UInt4(tmpMask, tmpMask);
2576                                         mergedMask &= rgbaMask;
2577                                 }
2578                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2579
2580                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2581
2582                                 value = *Pointer<UShort8>(buffer);
2583                                 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2584                                 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2585                                 if((rgbaWriteMask & 0xF) != 0xF)
2586                                 {
2587                                         mergedMask &= rgbaMask;
2588                                 }
2589                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2590                         }
2591                         break;
2592                 case FORMAT_A8B8G8R8I:
2593                 case FORMAT_A8B8G8R8UI:
2594                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2595                         {
2596                                 UInt2 value, packedCol, mergedMask;
2597
2598                                 buffer = cBuffer + 4 * x;
2599
2600                                 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2601                                 {
2602                                         packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2603                                 }
2604                                 else
2605                                 {
2606                                         packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2607                                 }
2608                                 value = *Pointer<UInt2>(buffer, 16);
2609                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2610                                 if(rgbaWriteMask != 0xF)
2611                                 {
2612                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2613                                 }
2614                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2615
2616                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2617
2618                                 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2619                                 {
2620                                         packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2621                                 }
2622                                 else
2623                                 {
2624                                         packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2625                                 }
2626                                 value = *Pointer<UInt2>(buffer, 16);
2627                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2628                                 if(rgbaWriteMask != 0xF)
2629                                 {
2630                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2631                                 }
2632                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2633                         }
2634                         break;
2635                 default:
2636                         ASSERT(false);
2637                 }
2638         }
2639
2640         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2641         {
2642                 return UShort4(cf * Float4(0xFFFF), saturate);
2643         }
2644
2645         void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2646         {
2647                 c.x = As<UShort4>(c.x) >> 4;
2648                 c.y = As<UShort4>(c.y) >> 4;
2649                 c.z = As<UShort4>(c.z) >> 4;
2650
2651                 sRGBtoLinear12_16(c);
2652         }
2653
2654         void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2655         {
2656                 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2657
2658                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2659                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2660                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2661                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2662
2663                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2664                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2665                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2666                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2667
2668                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2669                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2670                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2671                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2672         }
2673
2674         void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2675         {
2676                 c.x = As<UShort4>(c.x) >> 4;
2677                 c.y = As<UShort4>(c.y) >> 4;
2678                 c.z = As<UShort4>(c.z) >> 4;
2679
2680                 linearToSRGB12_16(c);
2681         }
2682
2683         void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2684         {
2685                 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2686
2687                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2688                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2689                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2690                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2691
2692                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2693                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2694                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2695                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2696
2697                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2698                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2699                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2700                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2701         }
2702
2703         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2704         {
2705                 Float4 linear = x * x;
2706                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2707
2708                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2709         }
2710
2711         bool PixelRoutine::colorUsed()
2712         {
2713                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2714         }
2715 }