OSDN Git Service

Added RG8 fragment output format
[android-x86/external-swiftshader.git] / src / Shader / PixelRoutine.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
21 #include "CPUID.hpp"
22 #include "SamplerCore.hpp"
23 #include "Constants.hpp"
24 #include "Debug.hpp"
25
26 namespace sw
27 {
28         extern bool complementaryDepthBuffer;
29         extern bool postBlendSRGB;
30         extern bool exactColorRounding;
31         extern bool forceClearRegisters;
32
33         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
34         {
35                 if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
36                 {
37                         for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
38                         {
39                                 v[i].x = Float4(0.0f);
40                                 v[i].y = Float4(0.0f);
41                                 v[i].z = Float4(0.0f);
42                                 v[i].w = Float4(0.0f);
43                         }
44                 }
45         }
46
47         PixelRoutine::~PixelRoutine()
48         {
49                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
50                 {
51                         delete sampler[i];
52                 }
53         }
54
55         void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
56         {
57                 #if PERF_PROFILE
58                         Long pipeTime = Ticks();
59                 #endif
60
61                 for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
62                 {
63                         sampler[i] = new SamplerCore(constants, state.sampler[i]);
64                 }
65
66                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
67
68                 Int zMask[4];   // Depth mask
69                 Int sMask[4];   // Stencil mask
70
71                 for(unsigned int q = 0; q < state.multiSample; q++)
72                 {
73                         zMask[q] = cMask[q];
74                         sMask[q] = cMask[q];
75                 }
76
77                 for(unsigned int q = 0; q < state.multiSample; q++)
78                 {
79                         stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
80                 }
81
82                 Float4 f;
83                 Float4 rhwCentroid;
84
85                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
86
87                 if(interpolateZ())
88                 {
89                         for(unsigned int q = 0; q < state.multiSample; q++)
90                         {
91                                 Float4 x = xxxx;
92
93                                 if(state.multiSample > 1)
94                                 {
95                                         x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
96                                 }
97
98                                 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
99                         }
100                 }
101
102                 Bool depthPass = false;
103
104                 if(earlyDepthTest)
105                 {
106                         for(unsigned int q = 0; q < state.multiSample; q++)
107                         {
108                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
109                         }
110                 }
111
112                 If(depthPass || Bool(!earlyDepthTest))
113                 {
114                         #if PERF_PROFILE
115                                 Long interpTime = Ticks();
116                         #endif
117
118                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
119
120                         // Centroid locations
121                         Float4 XXXX = Float4(0.0f);
122                         Float4 YYYY = Float4(0.0f);
123
124                         if(state.centroid)
125                         {
126                                 Float4 WWWW(1.0e-9f);
127
128                                 for(unsigned int q = 0; q < state.multiSample; q++)
129                                 {
130                                         XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
131                                         YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
132                                         WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
133                                 }
134
135                                 WWWW = Rcp_pp(WWWW);
136                                 XXXX *= WWWW;
137                                 YYYY *= WWWW;
138
139                                 XXXX += xxxx;
140                                 YYYY += yyyy;
141                         }
142
143                         if(interpolateW())
144                         {
145                                 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
146                                 rhw = reciprocal(w, false, false, true);
147
148                                 if(state.centroid)
149                                 {
150                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
151                                 }
152                         }
153
154                         for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
155                         {
156                                 for(int component = 0; component < 4; component++)
157                                 {
158                                         if(state.interpolant[interpolant].component & (1 << component))
159                                         {
160                                                 if(!state.interpolant[interpolant].centroid)
161                                                 {
162                                                         v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
163                                                 }
164                                                 else
165                                                 {
166                                                         v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
167                                                 }
168                                         }
169                                 }
170
171                                 Float4 rcp;
172
173                                 switch(state.interpolant[interpolant].project)
174                                 {
175                                 case 0:
176                                         break;
177                                 case 1:
178                                         rcp = reciprocal(v[interpolant].y);
179                                         v[interpolant].x = v[interpolant].x * rcp;
180                                         break;
181                                 case 2:
182                                         rcp = reciprocal(v[interpolant].z);
183                                         v[interpolant].x = v[interpolant].x * rcp;
184                                         v[interpolant].y = v[interpolant].y * rcp;
185                                         break;
186                                 case 3:
187                                         rcp = reciprocal(v[interpolant].w);
188                                         v[interpolant].x = v[interpolant].x * rcp;
189                                         v[interpolant].y = v[interpolant].y * rcp;
190                                         v[interpolant].z = v[interpolant].z * rcp;
191                                         break;
192                                 }
193                         }
194
195                         if(state.fog.component)
196                         {
197                                 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
198                         }
199
200                         setBuiltins(x, y, z, w);
201
202                         #if PERF_PROFILE
203                                 cycles[PERF_INTERP] += Ticks() - interpTime;
204                         #endif
205
206                         Bool alphaPass = true;
207
208                         if(colorUsed())
209                         {
210                                 #if PERF_PROFILE
211                                         Long shaderTime = Ticks();
212                                 #endif
213
214                                 applyShader(cMask);
215
216                                 #if PERF_PROFILE
217                                         cycles[PERF_SHADER] += Ticks() - shaderTime;
218                                 #endif
219
220                                 alphaPass = alphaTest(cMask);
221
222                                 if((shader && shader->containsKill()) || state.alphaTestActive())
223                                 {
224                                         for(unsigned int q = 0; q < state.multiSample; q++)
225                                         {
226                                                 zMask[q] &= cMask[q];
227                                                 sMask[q] &= cMask[q];
228                                         }
229                                 }
230                         }
231
232                         If(alphaPass)
233                         {
234                                 if(!earlyDepthTest)
235                                 {
236                                         for(unsigned int q = 0; q < state.multiSample; q++)
237                                         {
238                                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
239                                         }
240                                 }
241
242                                 #if PERF_PROFILE
243                                         Long ropTime = Ticks();
244                                 #endif
245
246                                 If(depthPass || Bool(earlyDepthTest))
247                                 {
248                                         for(unsigned int q = 0; q < state.multiSample; q++)
249                                         {
250                                                 if(state.multiSampleMask & (1 << q))
251                                                 {
252                                                         writeDepth(zBuffer, q, x, z[q], zMask[q]);
253
254                                                         if(state.occlusionEnabled)
255                                                         {
256                                                                 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
257                                                         }
258                                                 }
259                                         }
260
261                                         if(colorUsed())
262                                         {
263                                                 #if PERF_PROFILE
264                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
265                                                 #endif
266
267                                                 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
268                                         }
269                                 }
270
271                                 #if PERF_PROFILE
272                                         cycles[PERF_ROP] += Ticks() - ropTime;
273                                 #endif
274                         }
275                 }
276
277                 for(unsigned int q = 0; q < state.multiSample; q++)
278                 {
279                         if(state.multiSampleMask & (1 << q))
280                         {
281                                 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
282                         }
283                 }
284
285                 #if PERF_PROFILE
286                         cycles[PERF_PIPE] += Ticks() - pipeTime;
287                 #endif
288         }
289
290         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
291         {
292                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
293
294                 if(!flat)
295                 {
296                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
297                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
298
299                         if(perspective)
300                         {
301                                 interpolant *= rhw;
302                         }
303                 }
304
305                 return interpolant;
306         }
307
308         void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
309         {
310                 if(!state.stencilActive)
311                 {
312                         return;
313                 }
314
315                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
316
317                 Pointer<Byte> buffer = sBuffer + 2 * x;
318
319                 if(q > 0)
320                 {
321                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
322                 }
323
324                 Byte8 value = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
325                 Byte8 valueCCW = value;
326
327                 if(!state.noStencilMask)
328                 {
329                         value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
330                 }
331
332                 stencilTest(value, state.stencilCompareMode, false);
333
334                 if(state.twoSidedStencil)
335                 {
336                         if(!state.noStencilMaskCCW)
337                         {
338                                 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
339                         }
340
341                         stencilTest(valueCCW, state.stencilCompareModeCCW, true);
342
343                         value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
344                         valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
345                         value |= valueCCW;
346                 }
347
348                 sMask = SignMask(value) & cMask;
349         }
350
351         void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
352         {
353                 Byte8 equal;
354
355                 switch(stencilCompareMode)
356                 {
357                 case STENCIL_ALWAYS:
358                         value = Byte8(0xFFFFFFFFFFFFFFFF);
359                         break;
360                 case STENCIL_NEVER:
361                         value = Byte8(0x0000000000000000);
362                         break;
363                 case STENCIL_LESS:                      // a < b ~ b > a
364                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
365                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
366                         break;
367                 case STENCIL_EQUAL:
368                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
369                         break;
370                 case STENCIL_NOTEQUAL:          // a != b ~ !(a == b)
371                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
372                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
373                         break;
374                 case STENCIL_LESSEQUAL: // a <= b ~ (b > a) || (a == b)
375                         equal = value;
376                         equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
377                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
378                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
379                         value |= equal;
380                         break;
381                 case STENCIL_GREATER:           // a > b
382                         equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
383                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
384                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
385                         value = equal;
386                         break;
387                 case STENCIL_GREATEREQUAL:      // a >= b ~ !(a < b) ~ !(b > a)
388                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
389                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
390                         value ^= Byte8(0xFFFFFFFFFFFFFFFF);
391                         break;
392                 default:
393                         ASSERT(false);
394                 }
395         }
396
397         Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
398         {
399                 if(!state.depthTestActive)
400                 {
401                         return true;
402                 }
403
404                 Float4 Z = z;
405
406                 if(shader && shader->depthOverride())
407                 {
408                         if(complementaryDepthBuffer)
409                         {
410                                 Z = Float4(1.0f) - oDepth;
411                         }
412                         else
413                         {
414                                 Z = oDepth;
415                         }
416                 }
417
418                 Pointer<Byte> buffer;
419                 Int pitch;
420
421                 if(!state.quadLayoutDepthBuffer)
422                 {
423                         buffer = zBuffer + 4 * x;
424                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
425                 }
426                 else
427                 {
428                         buffer = zBuffer + 8 * x;
429                 }
430
431                 if(q > 0)
432                 {
433                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
434                 }
435
436                 Float4 zValue;
437
438                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
439                 {
440                         if(!state.quadLayoutDepthBuffer)
441                         {
442                                 // FIXME: Properly optimizes?
443                                 zValue.xy = *Pointer<Float4>(buffer);
444                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
445                         }
446                         else
447                         {
448                                 zValue = *Pointer<Float4>(buffer, 16);
449                         }
450                 }
451
452                 Int4 zTest;
453
454                 switch(state.depthCompareMode)
455                 {
456                 case DEPTH_ALWAYS:
457                         // Optimized
458                         break;
459                 case DEPTH_NEVER:
460                         // Optimized
461                         break;
462                 case DEPTH_EQUAL:
463                         zTest = CmpEQ(zValue, Z);
464                         break;
465                 case DEPTH_NOTEQUAL:
466                         zTest = CmpNEQ(zValue, Z);
467                         break;
468                 case DEPTH_LESS:
469                         if(complementaryDepthBuffer)
470                         {
471                                 zTest = CmpLT(zValue, Z);
472                         }
473                         else
474                         {
475                                 zTest = CmpNLE(zValue, Z);
476                         }
477                         break;
478                 case DEPTH_GREATEREQUAL:
479                         if(complementaryDepthBuffer)
480                         {
481                                 zTest = CmpNLT(zValue, Z);
482                         }
483                         else
484                         {
485                                 zTest = CmpLE(zValue, Z);
486                         }
487                         break;
488                 case DEPTH_LESSEQUAL:
489                         if(complementaryDepthBuffer)
490                         {
491                                 zTest = CmpLE(zValue, Z);
492                         }
493                         else
494                         {
495                                 zTest = CmpNLT(zValue, Z);
496                         }
497                         break;
498                 case DEPTH_GREATER:
499                         if(complementaryDepthBuffer)
500                         {
501                                 zTest = CmpNLE(zValue, Z);
502                         }
503                         else
504                         {
505                                 zTest = CmpLT(zValue, Z);
506                         }
507                         break;
508                 default:
509                         ASSERT(false);
510                 }
511
512                 switch(state.depthCompareMode)
513                 {
514                 case DEPTH_ALWAYS:
515                         zMask = cMask;
516                         break;
517                 case DEPTH_NEVER:
518                         zMask = 0x0;
519                         break;
520                 default:
521                         zMask = SignMask(zTest) & cMask;
522                         break;
523                 }
524
525                 if(state.stencilActive)
526                 {
527                         zMask &= sMask;
528                 }
529
530                 return zMask != 0;
531         }
532
533         void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
534         {
535                 Short4 cmp;
536                 Short4 equal;
537
538                 switch(state.alphaCompareMode)
539                 {
540                 case ALPHA_ALWAYS:
541                         aMask = 0xF;
542                         break;
543                 case ALPHA_NEVER:
544                         aMask = 0x0;
545                         break;
546                 case ALPHA_EQUAL:
547                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
548                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
549                         break;
550                 case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
551                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
552                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
553                         break;
554                 case ALPHA_LESS:           // a < b ~ b > a
555                         cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
556                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
557                         break;
558                 case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
559                         equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561                         cmp |= equal;
562                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
563                         break;
564                 case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
565                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
566                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
567                         break;
568                 case ALPHA_GREATER:        // a > b
569                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
570                         aMask = SignMask(Pack(cmp, Short4(0x0000)));
571                         break;
572                 default:
573                         ASSERT(false);
574                 }
575         }
576
577         void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
578         {
579                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
580                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
581                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
582                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
583
584                 Int aMask0 = SignMask(coverage0);
585                 Int aMask1 = SignMask(coverage1);
586                 Int aMask2 = SignMask(coverage2);
587                 Int aMask3 = SignMask(coverage3);
588
589                 cMask[0] &= aMask0;
590                 cMask[1] &= aMask1;
591                 cMask[2] &= aMask2;
592                 cMask[3] &= aMask3;
593         }
594
595         void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
596         {
597                 if(!state.fogActive)
598                 {
599                         return;
600                 }
601
602                 if(state.pixelFogMode != FOG_NONE)
603                 {
604                         pixelFog(fog);
605
606                         fog = Min(fog, Float4(1.0f));
607                         fog = Max(fog, Float4(0.0f));
608                 }
609
610                 c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
611                 c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
612                 c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
613
614                 c0.x *= fog;
615                 c0.y *= fog;
616                 c0.z *= fog;
617
618                 c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
619                 c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
620                 c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
621         }
622
623         void PixelRoutine::pixelFog(Float4 &visibility)
624         {
625                 Float4 &zw = visibility;
626
627                 if(state.pixelFogMode != FOG_NONE)
628                 {
629                         if(state.wBasedFog)
630                         {
631                                 zw = rhw;
632                         }
633                         else
634                         {
635                                 if(complementaryDepthBuffer)
636                                 {
637                                         zw = Float4(1.0f) - z[0];
638                                 }
639                                 else
640                                 {
641                                         zw = z[0];
642                                 }
643                         }
644                 }
645
646                 switch(state.pixelFogMode)
647                 {
648                 case FOG_NONE:
649                         break;
650                 case FOG_LINEAR:
651                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
652                         zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
653                         break;
654                 case FOG_EXP:
655                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
656                         zw = exponential2(zw, true);
657                         break;
658                 case FOG_EXP2:
659                         zw *= zw;
660                         zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
661                         zw = exponential2(zw, true);
662                         break;
663                 default:
664                         ASSERT(false);
665                 }
666         }
667
668         void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
669         {
670                 if(!state.depthWriteEnable)
671                 {
672                         return;
673                 }
674
675                 Float4 Z = z;
676
677                 if(shader && shader->depthOverride())
678                 {
679                         if(complementaryDepthBuffer)
680                         {
681                                 Z = Float4(1.0f) - oDepth;
682                         }
683                         else
684                         {
685                                 Z = oDepth;
686                         }
687                 }
688
689                 Pointer<Byte> buffer;
690                 Int pitch;
691
692                 if(!state.quadLayoutDepthBuffer)
693                 {
694                         buffer = zBuffer + 4 * x;
695                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
696                 }
697                 else
698                 {
699                         buffer = zBuffer + 8 * x;
700                 }
701
702                 if(q > 0)
703                 {
704                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
705                 }
706
707                 Float4 zValue;
708
709                 if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
710                 {
711                         if(!state.quadLayoutDepthBuffer)
712                         {
713                                 // FIXME: Properly optimizes?
714                                 zValue.xy = *Pointer<Float4>(buffer);
715                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
716                         }
717                         else
718                         {
719                                 zValue = *Pointer<Float4>(buffer, 16);
720                         }
721                 }
722
723                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
724                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
725                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
726
727                 if(!state.quadLayoutDepthBuffer)
728                 {
729                         // FIXME: Properly optimizes?
730                         *Pointer<Float2>(buffer) = Float2(Z.xy);
731                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
732                 }
733                 else
734                 {
735                         *Pointer<Float4>(buffer, 16) = Z;
736                 }
737         }
738
739         void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
740         {
741                 if(!state.stencilActive)
742                 {
743                         return;
744                 }
745
746                 if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
747                 {
748                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
749                         {
750                                 return;
751                         }
752                 }
753
754                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
755                 {
756                         return;
757                 }
758
759                 Pointer<Byte> buffer = sBuffer + 2 * x;
760
761                 if(q > 0)
762                 {
763                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
764                 }
765
766                 Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
767
768                 Byte8 newValue;
769                 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
770
771                 if(!state.noStencilWriteMask)
772                 {
773                         Byte8 maskedValue = bufferValue;
774                         newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
775                         maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
776                         newValue |= maskedValue;
777                 }
778
779                 if(state.twoSidedStencil)
780                 {
781                         Byte8 newValueCCW;
782
783                         stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
784
785                         if(!state.noStencilWriteMaskCCW)
786                         {
787                                 Byte8 maskedValue = bufferValue;
788                                 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
789                                 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
790                                 newValueCCW |= maskedValue;
791                         }
792
793                         newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
794                         newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
795                         newValue |= newValueCCW;
796                 }
797
798                 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
799                 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
800                 newValue |= bufferValue;
801
802                 *Pointer<UInt>(buffer) = UInt(As<Long>(newValue));
803         }
804
805         void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
806         {
807                 Byte8 &pass = newValue;
808                 Byte8 fail;
809                 Byte8 zFail;
810
811                 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
812
813                 if(stencilZFailOperation != stencilPassOperation)
814                 {
815                         stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
816                 }
817
818                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
819                 {
820                         stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
821                 }
822
823                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
824                 {
825                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
826                         {
827                                 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
828                                 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
829                                 pass |= zFail;
830                         }
831
832                         pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
833                         fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
834                         pass |= fail;
835                 }
836         }
837
838         void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
839         {
840                 switch(operation)
841                 {
842                 case OPERATION_KEEP:
843                         output = bufferValue;
844                         break;
845                 case OPERATION_ZERO:
846                         output = Byte8(0x0000000000000000);
847                         break;
848                 case OPERATION_REPLACE:
849                         output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
850                         break;
851                 case OPERATION_INCRSAT:
852                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
853                         break;
854                 case OPERATION_DECRSAT:
855                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
856                         break;
857                 case OPERATION_INVERT:
858                         output = bufferValue ^ Byte8(0xFFFFFFFFFFFFFFFF);
859                         break;
860                 case OPERATION_INCR:
861                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
862                         break;
863                 case OPERATION_DECR:
864                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
865                         break;
866                 default:
867                         ASSERT(false);
868                 }
869         }
870
871         void PixelRoutine::blendFactor(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
872         {
873                 switch(blendFactorActive)
874                 {
875                 case BLEND_ZERO:
876                         // Optimized
877                         break;
878                 case BLEND_ONE:
879                         // Optimized
880                         break;
881                 case BLEND_SOURCE:
882                         blendFactor.x = current.x;
883                         blendFactor.y = current.y;
884                         blendFactor.z = current.z;
885                         break;
886                 case BLEND_INVSOURCE:
887                         blendFactor.x = Short4(0xFFFFu) - current.x;
888                         blendFactor.y = Short4(0xFFFFu) - current.y;
889                         blendFactor.z = Short4(0xFFFFu) - current.z;
890                         break;
891                 case BLEND_DEST:
892                         blendFactor.x = pixel.x;
893                         blendFactor.y = pixel.y;
894                         blendFactor.z = pixel.z;
895                         break;
896                 case BLEND_INVDEST:
897                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
898                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
899                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
900                         break;
901                 case BLEND_SOURCEALPHA:
902                         blendFactor.x = current.w;
903                         blendFactor.y = current.w;
904                         blendFactor.z = current.w;
905                         break;
906                 case BLEND_INVSOURCEALPHA:
907                         blendFactor.x = Short4(0xFFFFu) - current.w;
908                         blendFactor.y = Short4(0xFFFFu) - current.w;
909                         blendFactor.z = Short4(0xFFFFu) - current.w;
910                         break;
911                 case BLEND_DESTALPHA:
912                         blendFactor.x = pixel.w;
913                         blendFactor.y = pixel.w;
914                         blendFactor.z = pixel.w;
915                         break;
916                 case BLEND_INVDESTALPHA:
917                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
918                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
919                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
920                         break;
921                 case BLEND_SRCALPHASAT:
922                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
923                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
924                         blendFactor.y = blendFactor.x;
925                         blendFactor.z = blendFactor.x;
926                         break;
927                 case BLEND_CONSTANT:
928                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
929                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
930                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
931                         break;
932                 case BLEND_INVCONSTANT:
933                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
934                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
935                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
936                         break;
937                 case BLEND_CONSTANTALPHA:
938                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
941                         break;
942                 case BLEND_INVCONSTANTALPHA:
943                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
946                         break;
947                 default:
948                         ASSERT(false);
949                 }
950         }
951
952         void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
953         {
954                 switch(blendFactorAlphaActive)
955                 {
956                 case BLEND_ZERO:
957                         // Optimized
958                         break;
959                 case BLEND_ONE:
960                         // Optimized
961                         break;
962                 case BLEND_SOURCE:
963                         blendFactor.w = current.w;
964                         break;
965                 case BLEND_INVSOURCE:
966                         blendFactor.w = Short4(0xFFFFu) - current.w;
967                         break;
968                 case BLEND_DEST:
969                         blendFactor.w = pixel.w;
970                         break;
971                 case BLEND_INVDEST:
972                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
973                         break;
974                 case BLEND_SOURCEALPHA:
975                         blendFactor.w = current.w;
976                         break;
977                 case BLEND_INVSOURCEALPHA:
978                         blendFactor.w = Short4(0xFFFFu) - current.w;
979                         break;
980                 case BLEND_DESTALPHA:
981                         blendFactor.w = pixel.w;
982                         break;
983                 case BLEND_INVDESTALPHA:
984                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
985                         break;
986                 case BLEND_SRCALPHASAT:
987                         blendFactor.w = Short4(0xFFFFu);
988                         break;
989                 case BLEND_CONSTANT:
990                 case BLEND_CONSTANTALPHA:
991                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
992                         break;
993                 case BLEND_INVCONSTANT:
994                 case BLEND_INVCONSTANTALPHA:
995                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
996                         break;
997                 default:
998                         ASSERT(false);
999                 }
1000         }
1001
1002         bool PixelRoutine::isSRGB(int index) const
1003         {
1004                 return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
1005         }
1006
1007         void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1008         {
1009                 Short4 c01;
1010                 Short4 c23;
1011                 Pointer<Byte> buffer;
1012                 Pointer<Byte> buffer2;
1013
1014                 switch(state.targetFormat[index])
1015                 {
1016                 case FORMAT_R5G6B5:
1017                         buffer = cBuffer + 2 * x;
1018                         buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1019                         c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1020
1021                         pixel.x = c01 & Short4(0xF800u);
1022                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
1023                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
1024                         pixel.w = Short4(0xFFFFu);
1025                         break;
1026                 case FORMAT_A8R8G8B8:
1027                         buffer = cBuffer + 4 * x;
1028                         c01 = *Pointer<Short4>(buffer);
1029                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1030                         c23 = *Pointer<Short4>(buffer);
1031                         pixel.z = c01;
1032                         pixel.y = c01;
1033                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1034                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1035                         pixel.x = pixel.z;
1036                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1037                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1038                         pixel.y = pixel.z;
1039                         pixel.w = pixel.x;
1040                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1041                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1042                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1043                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1044                         break;
1045                 case FORMAT_A8B8G8R8:
1046                 case FORMAT_SRGB8_A8:
1047                         buffer = cBuffer + 4 * x;
1048                         c01 = *Pointer<Short4>(buffer);
1049                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1050                         c23 = *Pointer<Short4>(buffer);
1051                         pixel.z = c01;
1052                         pixel.y = c01;
1053                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1054                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1055                         pixel.x = pixel.z;
1056                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1057                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1058                         pixel.y = pixel.z;
1059                         pixel.w = pixel.x;
1060                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1061                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1062                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1064                         break;
1065                 case FORMAT_A8:
1066                         buffer = cBuffer + 1 * x;
1067                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1068                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1069                         pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1070                         pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1071                         pixel.x = Short4(0x0000);
1072                         pixel.y = Short4(0x0000);
1073                         pixel.z = Short4(0x0000);
1074                         break;
1075                 case FORMAT_X8R8G8B8:
1076                         buffer = cBuffer + 4 * x;
1077                         c01 = *Pointer<Short4>(buffer);
1078                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1079                         c23 = *Pointer<Short4>(buffer);
1080                         pixel.z = c01;
1081                         pixel.y = c01;
1082                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1083                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1084                         pixel.x = pixel.z;
1085                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1086                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1087                         pixel.y = pixel.z;
1088                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1089                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1090                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1091                         pixel.w = Short4(0xFFFFu);
1092                         break;
1093                 case FORMAT_X8B8G8R8:
1094                 case FORMAT_SRGB8_X8:
1095                         buffer = cBuffer + 4 * x;
1096                         c01 = *Pointer<Short4>(buffer);
1097                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1098                         c23 = *Pointer<Short4>(buffer);
1099                         pixel.z = c01;
1100                         pixel.y = c01;
1101                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1102                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1103                         pixel.x = pixel.z;
1104                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1105                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1106                         pixel.y = pixel.z;
1107                         pixel.w = pixel.x;
1108                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1109                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1110                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1111                         pixel.w = Short4(0xFFFFu);
1112                         break;
1113                 case FORMAT_A8G8R8B8Q:
1114                         UNIMPLEMENTED();
1115                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1117                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118                 //      pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1119                         break;
1120                 case FORMAT_X8G8R8B8Q:
1121                         UNIMPLEMENTED();
1122                 //      pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123                 //      pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1124                 //      pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1125                 //      pixel.w = Short4(0xFFFFu);
1126                         break;
1127                 case FORMAT_A16B16G16R16:
1128                         buffer = cBuffer;
1129                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1130                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1131                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1132                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1133                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1134                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1135                         break;
1136                 case FORMAT_G16R16:
1137                         buffer = cBuffer;
1138                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1139                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1140                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1141                         pixel.z = pixel.x;
1142                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1143                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1144                         pixel.y = pixel.z;
1145                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1146                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1147                         pixel.z = Short4(0xFFFFu);
1148                         pixel.w = Short4(0xFFFFu);
1149                         break;
1150                 default:
1151                         ASSERT(false);
1152                 }
1153
1154                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1155                 {
1156                         sRGBtoLinear16_12_16(pixel);
1157                 }
1158         }
1159
1160         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1161         {
1162                 if(!state.alphaBlendActive)
1163                 {
1164                         return;
1165                 }
1166
1167                 Vector4s pixel;
1168                 readPixel(index, cBuffer, x, pixel);
1169
1170                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1171                 Vector4s sourceFactor;
1172                 Vector4s destFactor;
1173
1174                 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1175                 blendFactor(destFactor, current, pixel, state.destBlendFactor);
1176
1177                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1178                 {
1179                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1180                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1181                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1182                 }
1183
1184                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1185                 {
1186                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1187                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1188                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1189                 }
1190
1191                 switch(state.blendOperation)
1192                 {
1193                 case BLENDOP_ADD:
1194                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1195                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1196                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1197                         break;
1198                 case BLENDOP_SUB:
1199                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1200                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1201                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1202                         break;
1203                 case BLENDOP_INVSUB:
1204                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1205                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1206                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1207                         break;
1208                 case BLENDOP_MIN:
1209                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1210                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1211                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1212                         break;
1213                 case BLENDOP_MAX:
1214                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1215                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1216                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1217                         break;
1218                 case BLENDOP_SOURCE:
1219                         // No operation
1220                         break;
1221                 case BLENDOP_DEST:
1222                         current.x = pixel.x;
1223                         current.y = pixel.y;
1224                         current.z = pixel.z;
1225                         break;
1226                 case BLENDOP_NULL:
1227                         current.x = Short4(0x0000);
1228                         current.y = Short4(0x0000);
1229                         current.z = Short4(0x0000);
1230                         break;
1231                 default:
1232                         ASSERT(false);
1233                 }
1234
1235                 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1236                 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1237
1238                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1239                 {
1240                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1241                 }
1242
1243                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1244                 {
1245                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1246                 }
1247
1248                 switch(state.blendOperationAlpha)
1249                 {
1250                 case BLENDOP_ADD:
1251                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1252                         break;
1253                 case BLENDOP_SUB:
1254                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1255                         break;
1256                 case BLENDOP_INVSUB:
1257                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1258                         break;
1259                 case BLENDOP_MIN:
1260                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1261                         break;
1262                 case BLENDOP_MAX:
1263                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1264                         break;
1265                 case BLENDOP_SOURCE:
1266                         // No operation
1267                         break;
1268                 case BLENDOP_DEST:
1269                         current.w = pixel.w;
1270                         break;
1271                 case BLENDOP_NULL:
1272                         current.w = Short4(0x0000);
1273                         break;
1274                 default:
1275                         ASSERT(false);
1276                 }
1277         }
1278
1279         void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1280         {
1281                 if(state.logicalOperation == LOGICALOP_COPY)
1282                 {
1283                         return;
1284                 }
1285
1286                 Vector4s pixel;
1287                 readPixel(index, cBuffer, x, pixel);
1288
1289                 switch(state.logicalOperation)
1290                 {
1291                 case LOGICALOP_CLEAR:
1292                         current.x = UShort4(0);
1293                         current.y = UShort4(0);
1294                         current.z = UShort4(0);
1295                         break;
1296                 case LOGICALOP_SET:
1297                         current.x = UShort4(0xFFFFu);
1298                         current.y = UShort4(0xFFFFu);
1299                         current.z = UShort4(0xFFFFu);
1300                         break;
1301                 case LOGICALOP_COPY:
1302                         ASSERT(false);   // Optimized out
1303                         break;
1304                 case LOGICALOP_COPY_INVERTED:
1305                         current.x = ~current.x;
1306                         current.y = ~current.y;
1307                         current.z = ~current.z;
1308                         break;
1309                 case LOGICALOP_NOOP:
1310                         current.x = pixel.x;
1311                         current.y = pixel.y;
1312                         current.z = pixel.z;
1313                         break;
1314                 case LOGICALOP_INVERT:
1315                         current.x = ~pixel.x;
1316                         current.y = ~pixel.y;
1317                         current.z = ~pixel.z;
1318                         break;
1319                 case LOGICALOP_AND:
1320                         current.x = pixel.x & current.x;
1321                         current.y = pixel.y & current.y;
1322                         current.z = pixel.z & current.z;
1323                         break;
1324                 case LOGICALOP_NAND:
1325                         current.x = ~(pixel.x & current.x);
1326                         current.y = ~(pixel.y & current.y);
1327                         current.z = ~(pixel.z & current.z);
1328                         break;
1329                 case LOGICALOP_OR:
1330                         current.x = pixel.x | current.x;
1331                         current.y = pixel.y | current.y;
1332                         current.z = pixel.z | current.z;
1333                         break;
1334                 case LOGICALOP_NOR:
1335                         current.x = ~(pixel.x | current.x);
1336                         current.y = ~(pixel.y | current.y);
1337                         current.z = ~(pixel.z | current.z);
1338                         break;
1339                 case LOGICALOP_XOR:
1340                         current.x = pixel.x ^ current.x;
1341                         current.y = pixel.y ^ current.y;
1342                         current.z = pixel.z ^ current.z;
1343                         break;
1344                 case LOGICALOP_EQUIV:
1345                         current.x = ~(pixel.x ^ current.x);
1346                         current.y = ~(pixel.y ^ current.y);
1347                         current.z = ~(pixel.z ^ current.z);
1348                         break;
1349                 case LOGICALOP_AND_REVERSE:
1350                         current.x = ~pixel.x & current.x;
1351                         current.y = ~pixel.y & current.y;
1352                         current.z = ~pixel.z & current.z;
1353                         break;
1354                 case LOGICALOP_AND_INVERTED:
1355                         current.x = pixel.x & ~current.x;
1356                         current.y = pixel.y & ~current.y;
1357                         current.z = pixel.z & ~current.z;
1358                         break;
1359                 case LOGICALOP_OR_REVERSE:
1360                         current.x = ~pixel.x | current.x;
1361                         current.y = ~pixel.y | current.y;
1362                         current.z = ~pixel.z | current.z;
1363                         break;
1364                 case LOGICALOP_OR_INVERTED:
1365                         current.x = pixel.x | ~current.x;
1366                         current.y = pixel.y | ~current.y;
1367                         current.z = pixel.z | ~current.z;
1368                         break;
1369                 default:
1370                         ASSERT(false);
1371                 }
1372         }
1373
1374         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1375         {
1376                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1377                 {
1378                         linearToSRGB16_12_16(current);
1379                 }
1380
1381                 if(exactColorRounding)
1382                 {
1383                         switch(state.targetFormat[index])
1384                         {
1385                         case FORMAT_R5G6B5:
1386                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1387                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1388                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1389                                 break;
1390                         case FORMAT_X8G8R8B8Q:
1391                         case FORMAT_A8G8R8B8Q:
1392                         case FORMAT_X8R8G8B8:
1393                         case FORMAT_X8B8G8R8:
1394                         case FORMAT_A8R8G8B8:
1395                         case FORMAT_A8B8G8R8:
1396                         case FORMAT_SRGB8_X8:
1397                         case FORMAT_SRGB8_A8:
1398                         case FORMAT_G8R8:
1399                         case FORMAT_R8:
1400                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1401                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1402                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1403                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1404                                 break;
1405                         default:
1406                                 break;
1407                         }
1408                 }
1409
1410                 int rgbaWriteMask = state.colorWriteActive(index);
1411                 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1412
1413                 switch(state.targetFormat[index])
1414                 {
1415                 case FORMAT_R5G6B5:
1416                         {
1417                                 current.x = current.x & Short4(0xF800u);
1418                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1419                                 current.z = As<UShort4>(current.z) >> 11;
1420
1421                                 current.x = current.x | current.y | current.z;
1422                         }
1423                         break;
1424                 case FORMAT_X8G8R8B8Q:
1425                         UNIMPLEMENTED();
1426                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1427                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1428                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1429
1430                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1431                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1432                         break;
1433                 case FORMAT_A8G8R8B8Q:
1434                         UNIMPLEMENTED();
1435                 //      current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1436                 //      current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1437                 //      current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1438                 //      current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1439
1440                 //      current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1441                 //      current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1442                         break;
1443                 case FORMAT_X8R8G8B8:
1444                 case FORMAT_A8R8G8B8:
1445                         if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1446                         {
1447                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1448                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1449                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1450
1451                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1452                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1453
1454                                 current.x = current.z;
1455                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1456                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1457                                 current.y = current.z;
1458                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1459                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1460                         }
1461                         else
1462                         {
1463                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1464                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1465                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1466                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1467
1468                                 current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1469                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1470
1471                                 current.x = current.z;
1472                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1473                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1474                                 current.y = current.z;
1475                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1476                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1477                         }
1478                         break;
1479                 case FORMAT_X8B8G8R8:
1480                 case FORMAT_A8B8G8R8:
1481                 case FORMAT_SRGB8_X8:
1482                 case FORMAT_SRGB8_A8:
1483                         if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1484                         {
1485                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1486                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1487                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1488
1489                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1490                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1491
1492                                 current.x = current.z;
1493                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1494                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1495                                 current.y = current.z;
1496                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1497                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1498                         }
1499                         else
1500                         {
1501                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1502                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1503                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1504                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1505
1506                                 current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1507                                 current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1508
1509                                 current.x = current.z;
1510                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1511                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1512                                 current.y = current.z;
1513                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1514                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1515                         }
1516                         break;
1517                 case FORMAT_G8R8:
1518                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1519                         current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1520                         current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1521                         current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1522                         current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1523                         break;
1524                 case FORMAT_R8:
1525                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1526                         current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1527                         break;
1528                 case FORMAT_A8:
1529                         current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1530                         current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1531                         break;
1532                 case FORMAT_G16R16:
1533                         current.z = current.x;
1534                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1535                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1536                         current.y = current.z;
1537                         break;
1538                 case FORMAT_A16B16G16R16:
1539                         transpose4x4(current.x, current.y, current.z, current.w);
1540                         break;
1541                 default:
1542                         ASSERT(false);
1543                 }
1544
1545                 Short4 c01 = current.z;
1546                 Short4 c23 = current.y;
1547
1548                 Int xMask;   // Combination of all masks
1549
1550                 if(state.depthTestActive)
1551                 {
1552                         xMask = zMask;
1553                 }
1554                 else
1555                 {
1556                         xMask = cMask;
1557                 }
1558
1559                 if(state.stencilActive)
1560                 {
1561                         xMask &= sMask;
1562                 }
1563
1564                 switch(state.targetFormat[index])
1565                 {
1566                 case FORMAT_R5G6B5:
1567                         {
1568                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1569                                 Int value = *Pointer<Int>(buffer);
1570
1571                                 Int c01 = Extract(As<Int2>(current.x), 0);
1572
1573                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1574                                 {
1575                                         Int masked = value;
1576                                         c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1577                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1578                                         c01 |= masked;
1579                                 }
1580
1581                                 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1582                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1583                                 c01 |= value;
1584                                 *Pointer<Int>(buffer) = c01;
1585
1586                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1587                                 value = *Pointer<Int>(buffer);
1588
1589                                 Int c23 = Extract(As<Int2>(current.x), 1);
1590
1591                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1592                                 {
1593                                         Int masked = value;
1594                                         c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1595                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,invMask565Q[bgraWriteMask & 0x7][0]));
1596                                         c23 |= masked;
1597                                 }
1598
1599                                 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1600                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1601                                 c23 |= value;
1602                                 *Pointer<Int>(buffer) = c23;
1603                         }
1604                         break;
1605                 case FORMAT_A8G8R8B8Q:
1606                 case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1607                         UNIMPLEMENTED();
1608                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1609
1610                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1611                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1612                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1613                 //      {
1614                 //              Short4 masked = value;
1615                 //              c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1616                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1617                 //              c01 |= masked;
1618                 //      }
1619
1620                 //      c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1621                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1622                 //      c01 |= value;
1623                 //      *Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1624
1625                 //      value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1626
1627                 //      if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1628                 //         ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1629                 //          (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1630                 //      {
1631                 //              Short4 masked = value;
1632                 //              c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1633                 //              masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1634                 //              c23 |= masked;
1635                 //      }
1636
1637                 //      c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1638                 //      value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1639                 //      c23 |= value;
1640                 //      *Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1641                         break;
1642                 case FORMAT_A8R8G8B8:
1643                 case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1644                         {
1645                                 Pointer<Byte> buffer = cBuffer + x * 4;
1646                                 Short4 value = *Pointer<Short4>(buffer);
1647
1648                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1649                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1650                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1651                                 {
1652                                         Short4 masked = value;
1653                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1654                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1655                                         c01 |= masked;
1656                                 }
1657
1658                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1659                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1660                                 c01 |= value;
1661                                 *Pointer<Short4>(buffer) = c01;
1662
1663                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1664                                 value = *Pointer<Short4>(buffer);
1665
1666                                 if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1667                                    ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1668                                         (state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1669                                 {
1670                                         Short4 masked = value;
1671                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1672                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1673                                         c23 |= masked;
1674                                 }
1675
1676                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1677                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1678                                 c23 |= value;
1679                                 *Pointer<Short4>(buffer) = c23;
1680                         }
1681                         break;
1682                 case FORMAT_A8B8G8R8:
1683                 case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1684                 case FORMAT_SRGB8_X8:
1685                 case FORMAT_SRGB8_A8:
1686                         {
1687                                 Pointer<Byte> buffer = cBuffer + x * 4;
1688                                 Short4 value = *Pointer<Short4>(buffer);
1689
1690                                 bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1691                                               (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1692                                                ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1693
1694                                 if(masked)
1695                                 {
1696                                         Short4 masked = value;
1697                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1698                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1699                                         c01 |= masked;
1700                                 }
1701
1702                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1703                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1704                                 c01 |= value;
1705                                 *Pointer<Short4>(buffer) = c01;
1706
1707                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1708                                 value = *Pointer<Short4>(buffer);
1709
1710                                 if(masked)
1711                                 {
1712                                         Short4 masked = value;
1713                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1714                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1715                                         c23 |= masked;
1716                                 }
1717
1718                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1719                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1720                                 c23 |= value;
1721                                 *Pointer<Short4>(buffer) = c23;
1722                         }
1723                         break;
1724                 case FORMAT_G8R8:
1725                         if((rgbaWriteMask & 0x00000003) != 0x0)
1726                         {
1727                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1728                                 Int2 value;
1729                                 value = Insert(value, *Pointer<Int>(buffer), 0);
1730                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1731                                 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1732
1733                                 Int2 packedCol = As<Int2>(current.x);
1734
1735                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1736                                 if((rgbaWriteMask & 0x3) != 0x3)
1737                                 {
1738                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1739                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1740                                         mergedMask &= rgbaMask;
1741                                 }
1742
1743                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1744
1745                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1746                                 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1747                         }
1748                         break;
1749                 case FORMAT_R8:
1750                         if(rgbaWriteMask & 0x00000001)
1751                         {
1752                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1753                                 Short4 value;
1754                                 value = Insert(value, *Pointer<Short>(buffer), 0);
1755                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1756                                 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1757                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1758
1759                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1760                                 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1761                                 current.x |= value;
1762
1763                                 *Pointer<Short>(buffer) = Extract(current.x, 0);
1764                                 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1765                         }
1766                         break;
1767                 case FORMAT_A8:
1768                         if(rgbaWriteMask & 0x00000008)
1769                         {
1770                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1771                                 Short4 value;
1772                                 value = Insert(value, *Pointer<Short>(buffer), 0);
1773                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1774                                 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1775                                 value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1776
1777                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1778                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1779                                 current.w |= value;
1780
1781                                 *Pointer<Short>(buffer) = Extract(current.w, 0);
1782                                 *Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1783                         }
1784                         break;
1785                 case FORMAT_G16R16:
1786                         {
1787                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1788
1789                                 Short4 value = *Pointer<Short4>(buffer);
1790
1791                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1792                                 {
1793                                         Short4 masked = value;
1794                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1795                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1796                                         current.x |= masked;
1797                                 }
1798
1799                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1800                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1801                                 current.x |= value;
1802                                 *Pointer<Short4>(buffer) = current.x;
1803
1804                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1805
1806                                 value = *Pointer<Short4>(buffer);
1807
1808                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1809                                 {
1810                                         Short4 masked = value;
1811                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1812                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW01Q[rgbaWriteMask & 0x3][0]));
1813                                         current.y |= masked;
1814                                 }
1815
1816                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1817                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1818                                 current.y |= value;
1819                                 *Pointer<Short4>(buffer) = current.y;
1820                         }
1821                         break;
1822                 case FORMAT_A16B16G16R16:
1823                         {
1824                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1825
1826                                 {
1827                                         Short4 value = *Pointer<Short4>(buffer);
1828
1829                                         if(rgbaWriteMask != 0x0000000F)
1830                                         {
1831                                                 Short4 masked = value;
1832                                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1833                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1834                                                 current.x |= masked;
1835                                         }
1836
1837                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1838                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1839                                         current.x |= value;
1840                                         *Pointer<Short4>(buffer) = current.x;
1841                                 }
1842
1843                                 {
1844                                         Short4 value = *Pointer<Short4>(buffer + 8);
1845
1846                                         if(rgbaWriteMask != 0x0000000F)
1847                                         {
1848                                                 Short4 masked = value;
1849                                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1850                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1851                                                 current.y |= masked;
1852                                         }
1853
1854                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1855                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1856                                         current.y |= value;
1857                                         *Pointer<Short4>(buffer + 8) = current.y;
1858                                 }
1859
1860                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1861
1862                                 {
1863                                         Short4 value = *Pointer<Short4>(buffer);
1864
1865                                         if(rgbaWriteMask != 0x0000000F)
1866                                         {
1867                                                 Short4 masked = value;
1868                                                 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1869                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1870                                                 current.z |= masked;
1871                                         }
1872
1873                                         current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1874                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1875                                         current.z |= value;
1876                                         *Pointer<Short4>(buffer) = current.z;
1877                                 }
1878
1879                                 {
1880                                         Short4 value = *Pointer<Short4>(buffer + 8);
1881
1882                                         if(rgbaWriteMask != 0x0000000F)
1883                                         {
1884                                                 Short4 masked = value;
1885                                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1886                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1887                                                 current.w |= masked;
1888                                         }
1889
1890                                         current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1891                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1892                                         current.w |= value;
1893                                         *Pointer<Short4>(buffer + 8) = current.w;
1894                                 }
1895                         }
1896                         break;
1897                 default:
1898                         ASSERT(false);
1899                 }
1900         }
1901
1902         void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1903         {
1904                 switch(blendFactorActive)
1905                 {
1906                 case BLEND_ZERO:
1907                         // Optimized
1908                         break;
1909                 case BLEND_ONE:
1910                         // Optimized
1911                         break;
1912                 case BLEND_SOURCE:
1913                         blendFactor.x = oC.x;
1914                         blendFactor.y = oC.y;
1915                         blendFactor.z = oC.z;
1916                         break;
1917                 case BLEND_INVSOURCE:
1918                         blendFactor.x = Float4(1.0f) - oC.x;
1919                         blendFactor.y = Float4(1.0f) - oC.y;
1920                         blendFactor.z = Float4(1.0f) - oC.z;
1921                         break;
1922                 case BLEND_DEST:
1923                         blendFactor.x = pixel.x;
1924                         blendFactor.y = pixel.y;
1925                         blendFactor.z = pixel.z;
1926                         break;
1927                 case BLEND_INVDEST:
1928                         blendFactor.x = Float4(1.0f) - pixel.x;
1929                         blendFactor.y = Float4(1.0f) - pixel.y;
1930                         blendFactor.z = Float4(1.0f) - pixel.z;
1931                         break;
1932                 case BLEND_SOURCEALPHA:
1933                         blendFactor.x = oC.w;
1934                         blendFactor.y = oC.w;
1935                         blendFactor.z = oC.w;
1936                         break;
1937                 case BLEND_INVSOURCEALPHA:
1938                         blendFactor.x = Float4(1.0f) - oC.w;
1939                         blendFactor.y = Float4(1.0f) - oC.w;
1940                         blendFactor.z = Float4(1.0f) - oC.w;
1941                         break;
1942                 case BLEND_DESTALPHA:
1943                         blendFactor.x = pixel.w;
1944                         blendFactor.y = pixel.w;
1945                         blendFactor.z = pixel.w;
1946                         break;
1947                 case BLEND_INVDESTALPHA:
1948                         blendFactor.x = Float4(1.0f) - pixel.w;
1949                         blendFactor.y = Float4(1.0f) - pixel.w;
1950                         blendFactor.z = Float4(1.0f) - pixel.w;
1951                         break;
1952                 case BLEND_SRCALPHASAT:
1953                         blendFactor.x = Float4(1.0f) - pixel.w;
1954                         blendFactor.x = Min(blendFactor.x, oC.w);
1955                         blendFactor.y = blendFactor.x;
1956                         blendFactor.z = blendFactor.x;
1957                         break;
1958                 case BLEND_CONSTANT:
1959                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1960                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1961                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1962                         break;
1963                 case BLEND_INVCONSTANT:
1964                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1965                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1966                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1967                         break;
1968                 default:
1969                         ASSERT(false);
1970                 }
1971         }
1972
1973         void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1974         {
1975                 switch(blendFactorAlphaActive)
1976                 {
1977                 case BLEND_ZERO:
1978                         // Optimized
1979                         break;
1980                 case BLEND_ONE:
1981                         // Optimized
1982                         break;
1983                 case BLEND_SOURCE:
1984                         blendFactor.w = oC.w;
1985                         break;
1986                 case BLEND_INVSOURCE:
1987                         blendFactor.w = Float4(1.0f) - oC.w;
1988                         break;
1989                 case BLEND_DEST:
1990                         blendFactor.w = pixel.w;
1991                         break;
1992                 case BLEND_INVDEST:
1993                         blendFactor.w = Float4(1.0f) - pixel.w;
1994                         break;
1995                 case BLEND_SOURCEALPHA:
1996                         blendFactor.w = oC.w;
1997                         break;
1998                 case BLEND_INVSOURCEALPHA:
1999                         blendFactor.w = Float4(1.0f) - oC.w;
2000                         break;
2001                 case BLEND_DESTALPHA:
2002                         blendFactor.w = pixel.w;
2003                         break;
2004                 case BLEND_INVDESTALPHA:
2005                         blendFactor.w = Float4(1.0f) - pixel.w;
2006                         break;
2007                 case BLEND_SRCALPHASAT:
2008                         blendFactor.w = Float4(1.0f);
2009                         break;
2010                 case BLEND_CONSTANT:
2011                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2012                         break;
2013                 case BLEND_INVCONSTANT:
2014                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2015                         break;
2016                 default:
2017                         ASSERT(false);
2018                 }
2019         }
2020
2021         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2022         {
2023                 if(!state.alphaBlendActive)
2024                 {
2025                         return;
2026                 }
2027
2028                 Pointer<Byte> buffer;
2029                 Vector4f pixel;
2030
2031                 Vector4s color;
2032                 Short4 c01;
2033                 Short4 c23;
2034
2035                 Float4 one;
2036                 if(Surface::isFloatFormat(state.targetFormat[index]))
2037                 {
2038                         one = Float4(1.0f);
2039                 }
2040                 else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2041                 {
2042                         one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2043                 }
2044
2045                 switch(state.targetFormat[index])
2046                 {
2047                 case FORMAT_R32I:
2048                 case FORMAT_R32UI:
2049                 case FORMAT_R32F:
2050                         buffer = cBuffer;
2051                         // FIXME: movlps
2052                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2053                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2054                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2055                         // FIXME: movhps
2056                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2057                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2058                         pixel.y = pixel.z = pixel.w = one;
2059                         break;
2060                 case FORMAT_G32R32I:
2061                 case FORMAT_G32R32UI:
2062                 case FORMAT_G32R32F:
2063                         buffer = cBuffer;
2064                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2065                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2066                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2067                         pixel.z = pixel.x;
2068                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2069                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2070                         pixel.y = pixel.z;
2071                         pixel.z = pixel.w = one;
2072                         break;
2073                 case FORMAT_X32B32G32R32F:
2074                 case FORMAT_A32B32G32R32F:
2075                 case FORMAT_A32B32G32R32I:
2076                 case FORMAT_A32B32G32R32UI:
2077                         buffer = cBuffer;
2078                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2079                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2080                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2081                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2082                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2083                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2084                         if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2085                         {
2086                                 pixel.w = Float4(1.0f);
2087                         }
2088                         break;
2089                 default:
2090                         ASSERT(false);
2091                 }
2092
2093                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2094                 {
2095                         sRGBtoLinear(pixel.x);
2096                         sRGBtoLinear(pixel.y);
2097                         sRGBtoLinear(pixel.z);
2098                 }
2099
2100                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2101                 Vector4f sourceFactor;
2102                 Vector4f destFactor;
2103
2104                 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2105                 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2106
2107                 if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2108                 {
2109                         oC.x *= sourceFactor.x;
2110                         oC.y *= sourceFactor.y;
2111                         oC.z *= sourceFactor.z;
2112                 }
2113
2114                 if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2115                 {
2116                         pixel.x *= destFactor.x;
2117                         pixel.y *= destFactor.y;
2118                         pixel.z *= destFactor.z;
2119                 }
2120
2121                 switch(state.blendOperation)
2122                 {
2123                 case BLENDOP_ADD:
2124                         oC.x += pixel.x;
2125                         oC.y += pixel.y;
2126                         oC.z += pixel.z;
2127                         break;
2128                 case BLENDOP_SUB:
2129                         oC.x -= pixel.x;
2130                         oC.y -= pixel.y;
2131                         oC.z -= pixel.z;
2132                         break;
2133                 case BLENDOP_INVSUB:
2134                         oC.x = pixel.x - oC.x;
2135                         oC.y = pixel.y - oC.y;
2136                         oC.z = pixel.z - oC.z;
2137                         break;
2138                 case BLENDOP_MIN:
2139                         oC.x = Min(oC.x, pixel.x);
2140                         oC.y = Min(oC.y, pixel.y);
2141                         oC.z = Min(oC.z, pixel.z);
2142                         break;
2143                 case BLENDOP_MAX:
2144                         oC.x = Max(oC.x, pixel.x);
2145                         oC.y = Max(oC.y, pixel.y);
2146                         oC.z = Max(oC.z, pixel.z);
2147                         break;
2148                 case BLENDOP_SOURCE:
2149                         // No operation
2150                         break;
2151                 case BLENDOP_DEST:
2152                         oC.x = pixel.x;
2153                         oC.y = pixel.y;
2154                         oC.z = pixel.z;
2155                         break;
2156                 case BLENDOP_NULL:
2157                         oC.x = Float4(0.0f);
2158                         oC.y = Float4(0.0f);
2159                         oC.z = Float4(0.0f);
2160                         break;
2161                 default:
2162                         ASSERT(false);
2163                 }
2164
2165                 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2166                 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2167
2168                 if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2169                 {
2170                         oC.w *= sourceFactor.w;
2171                 }
2172
2173                 if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2174                 {
2175                         pixel.w *= destFactor.w;
2176                 }
2177
2178                 switch(state.blendOperationAlpha)
2179                 {
2180                 case BLENDOP_ADD:
2181                         oC.w += pixel.w;
2182                         break;
2183                 case BLENDOP_SUB:
2184                         oC.w -= pixel.w;
2185                         break;
2186                 case BLENDOP_INVSUB:
2187                         pixel.w -= oC.w;
2188                         oC.w = pixel.w;
2189                         break;
2190                 case BLENDOP_MIN:
2191                         oC.w = Min(oC.w, pixel.w);
2192                         break;
2193                 case BLENDOP_MAX:
2194                         oC.w = Max(oC.w, pixel.w);
2195                         break;
2196                 case BLENDOP_SOURCE:
2197                         // No operation
2198                         break;
2199                 case BLENDOP_DEST:
2200                         oC.w = pixel.w;
2201                         break;
2202                 case BLENDOP_NULL:
2203                         oC.w = Float4(0.0f);
2204                         break;
2205                 default:
2206                         ASSERT(false);
2207                 }
2208         }
2209
2210         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2211         {
2212                 switch(state.targetFormat[index])
2213                 {
2214                 case FORMAT_R32F:
2215                 case FORMAT_R32I:
2216                 case FORMAT_R32UI:
2217                 case FORMAT_R16I:
2218                 case FORMAT_R16UI:
2219                 case FORMAT_R8I:
2220                 case FORMAT_R8UI:
2221                         break;
2222                 case FORMAT_G32R32F:
2223                 case FORMAT_G32R32I:
2224                 case FORMAT_G32R32UI:
2225                 case FORMAT_G16R16I:
2226                 case FORMAT_G16R16UI:
2227                 case FORMAT_G8R8I:
2228                 case FORMAT_G8R8UI:
2229                         oC.z = oC.x;
2230                         oC.x = UnpackLow(oC.x, oC.y);
2231                         oC.z = UnpackHigh(oC.z, oC.y);
2232                         oC.y = oC.z;
2233                         break;
2234                 case FORMAT_X32B32G32R32F:
2235                 case FORMAT_A32B32G32R32F:
2236                 case FORMAT_A32B32G32R32I:
2237                 case FORMAT_A32B32G32R32UI:
2238                 case FORMAT_A16B16G16R16I:
2239                 case FORMAT_A16B16G16R16UI:
2240                 case FORMAT_A8B8G8R8I:
2241                 case FORMAT_A8B8G8R8UI:
2242                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2243                         break;
2244                 default:
2245                         ASSERT(false);
2246                 }
2247
2248                 int rgbaWriteMask = state.colorWriteActive(index);
2249
2250                 Int xMask;   // Combination of all masks
2251
2252                 if(state.depthTestActive)
2253                 {
2254                         xMask = zMask;
2255                 }
2256                 else
2257                 {
2258                         xMask = cMask;
2259                 }
2260
2261                 if(state.stencilActive)
2262                 {
2263                         xMask &= sMask;
2264                 }
2265
2266                 Pointer<Byte> buffer;
2267                 Float4 value;
2268
2269                 switch(state.targetFormat[index])
2270                 {
2271                 case FORMAT_R32F:
2272                 case FORMAT_R32I:
2273                 case FORMAT_R32UI:
2274                         if(rgbaWriteMask & 0x00000001)
2275                         {
2276                                 buffer = cBuffer + 4 * x;
2277
2278                                 // FIXME: movlps
2279                                 value.x = *Pointer<Float>(buffer + 0);
2280                                 value.y = *Pointer<Float>(buffer + 4);
2281
2282                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2283
2284                                 // FIXME: movhps
2285                                 value.z = *Pointer<Float>(buffer + 0);
2286                                 value.w = *Pointer<Float>(buffer + 4);
2287
2288                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2289                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2290                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2291
2292                                 // FIXME: movhps
2293                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2294                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2295
2296                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2297
2298                                 // FIXME: movlps
2299                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2300                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2301                         }
2302                         break;
2303                 case FORMAT_R16I:
2304                 case FORMAT_R16UI:
2305                         if(rgbaWriteMask & 0x00000001)
2306                         {
2307                                 buffer = cBuffer + 2 * x;
2308
2309                                 UShort4 xyzw;
2310                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2311
2312                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2313
2314                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2315                                 value = As<Float4>(Int4(xyzw));
2316
2317                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2318                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2319                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2320
2321                                 if(state.targetFormat[index] == FORMAT_R16I)
2322                                 {
2323                                         Float component = oC.x.z;
2324                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2325                                         component = oC.x.w;
2326                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2327
2328                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2329
2330                                         component = oC.x.x;
2331                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2332                                         component = oC.x.y;
2333                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2334                                 }
2335                                 else // FORMAT_R16UI
2336                                 {
2337                                         Float component = oC.x.z;
2338                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2339                                         component = oC.x.w;
2340                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2341
2342                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2343
2344                                         component = oC.x.x;
2345                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2346                                         component = oC.x.y;
2347                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2348                                 }
2349                         }
2350                         break;
2351                 case FORMAT_R8I:
2352                 case FORMAT_R8UI:
2353                         if(rgbaWriteMask & 0x00000001)
2354                         {
2355                                 buffer = cBuffer + x;
2356
2357                                 UInt xyzw, packedCol;
2358
2359                                 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2360                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2361                                 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2362
2363                                 Short4 tmpCol = Short4(As<Int4>(oC.x));
2364                                 if(state.targetFormat[index] == FORMAT_R8I)
2365                                 {
2366                                         tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2367                                 }
2368                                 else
2369                                 {
2370                                         tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2371                                 }
2372                                 packedCol = Extract(As<Int2>(tmpCol), 0);
2373
2374                                 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2375                                             (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2376
2377                                 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2378                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2379                                 *Pointer<UShort>(buffer) = UShort(packedCol);
2380                         }
2381                         break;
2382                 case FORMAT_G32R32F:
2383                 case FORMAT_G32R32I:
2384                 case FORMAT_G32R32UI:
2385                         buffer = cBuffer + 8 * x;
2386
2387                         value = *Pointer<Float4>(buffer);
2388
2389                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2390                         {
2391                                 Float4 masked = value;
2392                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2393                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2394                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2395                         }
2396
2397                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2398                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2399                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2400                         *Pointer<Float4>(buffer) = oC.x;
2401
2402                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2403
2404                         value = *Pointer<Float4>(buffer);
2405
2406                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2407                         {
2408                                 Float4 masked;
2409
2410                                 masked = value;
2411                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2412                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD01X[rgbaWriteMask & 0x3][0])));
2413                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2414                         }
2415
2416                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2417                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2418                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2419                         *Pointer<Float4>(buffer) = oC.y;
2420                         break;
2421                 case FORMAT_G16R16I:
2422                 case FORMAT_G16R16UI:
2423                         if((rgbaWriteMask & 0x00000003) != 0x0)
2424                         {
2425                                 buffer = cBuffer + 4 * x;
2426
2427                                 UInt2 rgbaMask;
2428                                 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2429                                 UShort4 value = *Pointer<UShort4>(buffer);
2430                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2431                                 if((rgbaWriteMask & 0x3) != 0x3)
2432                                 {
2433                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2434                                         rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2435                                         mergedMask &= rgbaMask;
2436                                 }
2437                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2438
2439                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2440
2441                                 packedCol = UShort4(As<Int4>(oC.y));
2442                                 value = *Pointer<UShort4>(buffer);
2443                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2444                                 if((rgbaWriteMask & 0x3) != 0x3)
2445                                 {
2446                                         mergedMask &= rgbaMask;
2447                                 }
2448                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2449                         }
2450                         break;
2451                 case FORMAT_G8R8I:
2452                 case FORMAT_G8R8UI:
2453                         if((rgbaWriteMask & 0x00000003) != 0x0)
2454                         {
2455                                 buffer = cBuffer + 2 * x;
2456
2457                                 Int2 xyzw, packedCol;
2458
2459                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2460                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2461                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2462
2463                                 if(state.targetFormat[index] == FORMAT_G8R8I)
2464                                 {
2465                                         packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2466                                 }
2467                                 else
2468                                 {
2469                                         packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2470                                 }
2471
2472                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2473                                 if((rgbaWriteMask & 0x3) != 0x3)
2474                                 {
2475                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2476                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2477                                         mergedMask &= rgbaMask;
2478                                 }
2479
2480                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2481
2482                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2483                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2484                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2485                         }
2486                         break;
2487                 case FORMAT_X32B32G32R32F:
2488                 case FORMAT_A32B32G32R32F:
2489                 case FORMAT_A32B32G32R32I:
2490                 case FORMAT_A32B32G32R32UI:
2491                         buffer = cBuffer + 16 * x;
2492
2493                         {
2494                                 value = *Pointer<Float4>(buffer, 16);
2495
2496                                 if(rgbaWriteMask != 0x0000000F)
2497                                 {
2498                                         Float4 masked = value;
2499                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2500                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2501                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2502                                 }
2503
2504                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2505                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2506                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2507                                 *Pointer<Float4>(buffer, 16) = oC.x;
2508                         }
2509
2510                         {
2511                                 value = *Pointer<Float4>(buffer + 16, 16);
2512
2513                                 if(rgbaWriteMask != 0x0000000F)
2514                                 {
2515                                         Float4 masked = value;
2516                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2517                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2518                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2519                                 }
2520
2521                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2522                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2523                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2524                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2525                         }
2526
2527                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2528
2529                         {
2530                                 value = *Pointer<Float4>(buffer, 16);
2531
2532                                 if(rgbaWriteMask != 0x0000000F)
2533                                 {
2534                                         Float4 masked = value;
2535                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2536                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2537                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2538                                 }
2539
2540                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2541                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2542                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2543                                 *Pointer<Float4>(buffer, 16) = oC.z;
2544                         }
2545
2546                         {
2547                                 value = (state.targetFormat[index] == FORMAT_X32B32G32R32F) ? Float4(1.0f) : *Pointer<Float4>(buffer + 16, 16);
2548
2549                                 if(rgbaWriteMask != 0x0000000F)
2550                                 {
2551                                         Float4 masked = value;
2552                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2553                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2554                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2555                                 }
2556
2557                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2558                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2559                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2560                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2561                         }
2562                         break;
2563                 case FORMAT_A16B16G16R16I:
2564                 case FORMAT_A16B16G16R16UI:
2565                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2566                         {
2567                                 buffer = cBuffer + 8 * x;
2568
2569                                 UInt4 rgbaMask;
2570                                 UShort8 value = *Pointer<UShort8>(buffer);
2571                                 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2572                                 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2573                                 if((rgbaWriteMask & 0xF) != 0xF)
2574                                 {
2575                                         UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2576                                         rgbaMask = UInt4(tmpMask, tmpMask);
2577                                         mergedMask &= rgbaMask;
2578                                 }
2579                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2580
2581                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2582
2583                                 value = *Pointer<UShort8>(buffer);
2584                                 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2585                                 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2586                                 if((rgbaWriteMask & 0xF) != 0xF)
2587                                 {
2588                                         mergedMask &= rgbaMask;
2589                                 }
2590                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2591                         }
2592                         break;
2593                 case FORMAT_A8B8G8R8I:
2594                 case FORMAT_A8B8G8R8UI:
2595                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2596                         {
2597                                 UInt2 value, packedCol, mergedMask;
2598
2599                                 buffer = cBuffer + 4 * x;
2600
2601                                 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2602                                 {
2603                                         packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2604                                 }
2605                                 else
2606                                 {
2607                                         packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2608                                 }
2609                                 value = *Pointer<UInt2>(buffer, 16);
2610                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2611                                 if(rgbaWriteMask != 0xF)
2612                                 {
2613                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2614                                 }
2615                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2616
2617                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2618
2619                                 if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2620                                 {
2621                                         packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2622                                 }
2623                                 else
2624                                 {
2625                                         packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2626                                 }
2627                                 value = *Pointer<UInt2>(buffer, 16);
2628                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2629                                 if(rgbaWriteMask != 0xF)
2630                                 {
2631                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2632                                 }
2633                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2634                         }
2635                         break;
2636                 default:
2637                         ASSERT(false);
2638                 }
2639         }
2640
2641         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2642         {
2643                 return UShort4(cf * Float4(0xFFFF), saturate);
2644         }
2645
2646         void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2647         {
2648                 c.x = As<UShort4>(c.x) >> 4;
2649                 c.y = As<UShort4>(c.y) >> 4;
2650                 c.z = As<UShort4>(c.z) >> 4;
2651
2652                 sRGBtoLinear12_16(c);
2653         }
2654
2655         void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2656         {
2657                 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2658
2659                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2660                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2661                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2662                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2663
2664                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2665                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2666                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2667                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2668
2669                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2670                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2671                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2672                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2673         }
2674
2675         void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2676         {
2677                 c.x = As<UShort4>(c.x) >> 4;
2678                 c.y = As<UShort4>(c.y) >> 4;
2679                 c.z = As<UShort4>(c.z) >> 4;
2680
2681                 linearToSRGB12_16(c);
2682         }
2683
2684         void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2685         {
2686                 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2687
2688                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2689                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2690                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2691                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2692
2693                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2694                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2695                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2696                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2697
2698                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2699                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2700                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2701                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2702         }
2703
2704         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2705         {
2706                 Float4 linear = x * x;
2707                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2708
2709                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2710         }
2711
2712         bool PixelRoutine::colorUsed()
2713         {
2714                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2715         }
2716 }