OSDN Git Service

Removed SwiftShader's custom Blending enums
[android-x86/external-swiftshader.git] / src / Pipeline / PixelRoutine.cpp
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "SamplerCore.hpp"
18 #include "Constants.hpp"
19 #include "Device/Renderer.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Surface.hpp"
22 #include "Device/Primitive.hpp"
23 #include "System/Debug.hpp"
24
25 namespace sw
26 {
27         extern bool complementaryDepthBuffer;   // Read by depthTest()/writeDepth(): selects 1 - oDepth for shader-written depth and mirrors the ordered depth comparisons.
28         extern bool postBlendSRGB;   // Defined elsewhere; not referenced in the portion of the file reviewed here.
29         extern bool exactColorRounding;   // Defined elsewhere; not referenced in the portion of the file reviewed here.
30         extern bool forceClearRegisters;   // When set, the PixelRoutine constructor zeroes every fragment input register.
31
32         PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
33                 : QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
34         {
                   // Forwards state and shader to QuadRasterizer and constructs the input
                   // register array v with a flag that is true only when a shader is present
                   // and it reports indirectAddressableInput.
                   //
                   // The input registers are zeroed when there is no shader, when the shader
                   // model is below 0x0200, or when forceClearRegisters is set. In every other
                   // case they are left as constructed.
35                 if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
36                 {
37                         for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
38                         {
                                   // Clear all four components of input register i.
39                                 v[i].x = Float4(0.0f);
40                                 v[i].y = Float4(0.0f);
41                                 v[i].z = Float4(0.0f);
42                                 v[i].w = Float4(0.0f);
43                         }
44                 }
45         }
46
47         PixelRoutine::~PixelRoutine()
48         {
                   // Intentionally empty: no explicit cleanup is performed here.
49         }
50
51         void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
52         {
                   // Processes one quad through the pixel pipeline in this order:
                   //   stencil test -> depth interpolation -> (early) depth test ->
                   //   attribute interpolation -> shader + alpha test -> (late) depth test ->
                   //   depth write / occlusion count / raster operation -> stencil write.
                   //
                   // cBuffer, zBuffer, sBuffer: colour, depth and stencil buffer pointers.
                   // cMask: per-sample coverage mask; it may be narrowed by the shader / alpha test.
                   // x, y: quad position, added to the primitive's xQuad / yQuad offsets.
                   //
                   // NOTE(review): lowercase if/for test `state` fields, whereas capitalised
                   // If(...) takes Reactor Bool values. Presumably the former are resolved
                   // while the routine is being built and the latter become branches in the
                   // generated routine - confirm against the Reactor documentation.
53                 #if PERF_PROFILE
54                         Long pipeTime = Ticks();
55                 #endif
56
                   // The depth test runs before shading only if neither state.depthOverride
                   // nor the alpha test is active.
57                 const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
58
59                 Int zMask[4];   // Depth mask
60                 Int sMask[4];   // Stencil mask
61
                   // Both masks start out equal to the coverage mask of each sample.
62                 for(unsigned int q = 0; q < state.multiSample; q++)
63                 {
64                         zMask[q] = cMask[q];
65                         sMask[q] = cMask[q];
66                 }
67
68                 for(unsigned int q = 0; q < state.multiSample; q++)
69                 {
70                         stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
71                 }
72
73                 Float4 f;
74                 Float4 rhwCentroid;
75
                   // x broadcast to four lanes plus the primitive's xQuad offsets.
76                 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
77
78                 if(interpolateZ())
79                 {
80                         for(unsigned int q = 0; q < state.multiSample; q++)
81                         {
                                   // This local Float4 x shadows the Int &x parameter inside the loop body.
82                                 Float4 x = xxxx;
83
84                                 if(state.multiSample > 1)
85                                 {
                                           // Subtract entry q of the Constants::X table (one float4 per sample).
86                                         x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
87                                 }
88
89                                 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
90                         }
91                 }
92
93                 Bool depthPass = false;
94
95                 if(earlyDepthTest)
96                 {
                           // depthPass becomes true if any sample passes its depth test.
97                         for(unsigned int q = 0; q < state.multiSample; q++)
98                         {
99                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
100                         }
101                 }
102
                    // Shading is skipped only when the early depth test ran and no sample passed.
103                 If(depthPass || Bool(!earlyDepthTest))
104                 {
105                         #if PERF_PROFILE
106                                 Long interpTime = Ticks();
107                         #endif
108
109                         Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
110
111                         // Centroid locations
112                         Float4 XXXX = Float4(0.0f);
113                         Float4 YYYY = Float4(0.0f);
114
115                         if(state.centroid)
116                         {
                                    // Accumulate sampleX / sampleY / weight table entries selected by each
                                    // sample's coverage mask, then normalise by the summed weight.
                                    // NOTE(review): the 1.0e-9f seed presumably keeps the reciprocal finite
                                    // when no sample is covered - confirm.
117                                 Float4 WWWW(1.0e-9f);
118
119                                 for(unsigned int q = 0; q < state.multiSample; q++)
120                                 {
121                                         XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
122                                         YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
123                                         WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
124                                 }
125
126                                 WWWW = Rcp_pp(WWWW);
127                                 XXXX *= WWWW;
128                                 YYYY *= WWWW;
129
                                    // Offset the normalised result by the quad's own coordinates.
130                                 XXXX += xxxx;
131                                 YYYY += yyyy;
132                         }
133
134                         if(interpolateW())
135                         {
136                                 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
137                                 rhw = reciprocal(w, false, false, true);
138
139                                 if(state.centroid)
140                                 {
141                                         rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
142                                 }
143                         }
144
                            // Interpolate every enabled component of every fragment input.
145                         for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
146                         {
147                                 for(int component = 0; component < 4; component++)
148                                 {
                                            // Bit `component` of the component mask enables this component.
149                                         if(state.interpolant[interpolant].component & (1 << component))
150                                         {
151                                                 if(!state.interpolant[interpolant].centroid)
152                                                 {
153                                                         v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
154                                                 }
155                                                 else
156                                                 {
157                                                         v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
158                                                 }
159                                         }
160                                 }
161
162                                 Float4 rcp;
163
                                    // Projection: `project` = 1, 2 or 3 divides the components before
                                    // y, z or w (respectively) by that component; 0 leaves the input as is.
164                                 switch(state.interpolant[interpolant].project)
165                                 {
166                                 case 0:
167                                         break;
168                                 case 1:
169                                         rcp = reciprocal(v[interpolant].y);
170                                         v[interpolant].x = v[interpolant].x * rcp;
171                                         break;
172                                 case 2:
173                                         rcp = reciprocal(v[interpolant].z);
174                                         v[interpolant].x = v[interpolant].x * rcp;
175                                         v[interpolant].y = v[interpolant].y * rcp;
176                                         break;
177                                 case 3:
178                                         rcp = reciprocal(v[interpolant].w);
179                                         v[interpolant].x = v[interpolant].x * rcp;
180                                         v[interpolant].y = v[interpolant].y * rcp;
181                                         v[interpolant].z = v[interpolant].z * rcp;
182                                         break;
183                                 }
184                         }
185
186                         if(state.fog.component)
187                         {
188                                 f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
189                         }
190
191                         setBuiltins(x, y, z, w);
192
193                         #if PERF_PROFILE
194                                 cycles[PERF_INTERP] += Ticks() - interpTime;
195                         #endif
196
197                         Bool alphaPass = true;
198
199                         if(colorUsed())
200                         {
201                                 #if PERF_PROFILE
202                                         Long shaderTime = Ticks();
203                                 #endif
204
205                                 applyShader(cMask);
206
207                                 #if PERF_PROFILE
208                                         cycles[PERF_SHADER] += Ticks() - shaderTime;
209                                 #endif
210
211                                 alphaPass = alphaTest(cMask);
212
                                    // If the shader can kill pixels or the alpha test is active, cMask may
                                    // have changed, so restrict the depth and stencil masks to it.
213                                 if((shader && shader->containsKill()) || state.alphaTestActive())
214                                 {
215                                         for(unsigned int q = 0; q < state.multiSample; q++)
216                                         {
217                                                 zMask[q] &= cMask[q];
218                                                 sMask[q] &= cMask[q];
219                                         }
220                                 }
221                         }
222
223                         If(alphaPass)
224                         {
                                    // Late depth test: runs here only when it was not done before shading.
225                                 if(!earlyDepthTest)
226                                 {
227                                         for(unsigned int q = 0; q < state.multiSample; q++)
228                                         {
229                                                 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
230                                         }
231                                 }
232
233                                 #if PERF_PROFILE
234                                         Long ropTime = Ticks();
235                                 #endif
236
                                    // With an early depth test, reaching this point already implies a pass.
237                                 If(depthPass || Bool(earlyDepthTest))
238                                 {
239                                         for(unsigned int q = 0; q < state.multiSample; q++)
240                                         {
                                                    // Only samples enabled in state.multiSampleMask are written.
241                                                 if(state.multiSampleMask & (1 << q))
242                                                 {
243                                                         writeDepth(zBuffer, q, x, z[q], zMask[q]);
244
245                                                         if(state.occlusionEnabled)
246                                                         {
                                                                    // Adds the occlusionCount table entry indexed by the
                                                                    // combined depth/stencil mask (4-byte entries).
247                                                                 occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
248                                                         }
249                                                 }
250                                         }
251
252                                         if(colorUsed())
253                                         {
254                                                 #if PERF_PROFILE
255                                                         AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
256                                                 #endif
257
258                                                 rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
259                                         }
260                                 }
261
262                                 #if PERF_PROFILE
263                                         cycles[PERF_ROP] += Ticks() - ropTime;
264                                 #endif
265                         }
266                 }
267
                    // The stencil write sits outside both If blocks, so it is issued whether or
                    // not the depth and alpha tests passed; the masks passed in carry the outcomes.
268                 for(unsigned int q = 0; q < state.multiSample; q++)
269                 {
270                         if(state.multiSampleMask & (1 << q))
271                         {
272                                 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
273                         }
274                 }
275
276                 #if PERF_PROFILE
277                         cycles[PERF_PIPE] += Ticks() - pipeTime;
278                 #endif
279         }
280
281         Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
282         {
                    // Evaluates a plane equation at (x, y).
                    //   flat:        returns the C term only.
                    //   otherwise:   returns C + x * A + y * B, additionally multiplied by rhw
                    //                when perspective is set.
                    // A, B and C are read from planeEquation at the PlaneEquation member offsets.
283                 Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
284
285                 if(!flat)
286                 {
287                         interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
288                                        y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
289
                            // The rhw factor is applied only on the non-flat path.
290                         if(perspective)
291                         {
292                                 interpolant *= rhw;
293                         }
294                 }
295
296                 return interpolant;
297         }
298
299         void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
300         {
                    // Runs the stencil comparison for sample q of the quad at x and stores the
                    // resulting mask, restricted to the coverage mask cMask, in sMask.
                    // When stencilling is inactive, sMask is left untouched.
301                 if(!state.stencilActive)
302                 {
303                         return;
304                 }
305
306                 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
307
                    // Address the quad's stencil bytes: 2 bytes per unit of x, plus q slices of
                    // stencilSliceB bytes for samples after the first.
308                 Pointer<Byte> buffer = sBuffer + 2 * x;
309
310                 if(q > 0)
311                 {
312                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
313                 }
314
                    // Keep an unmasked copy so the counter-clockwise test can apply its own mask.
315                 Byte8 value = *Pointer<Byte8>(buffer);
316                 Byte8 valueCCW = value;
317
318                 if(!state.noStencilMask)
319                 {
320                         value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
321                 }
322
323                 stencilTest(value, state.stencilCompareMode, false);
324
325                 if(state.twoSidedStencil)
326                 {
327                         if(!state.noStencilMaskCCW)
328                         {
329                                 valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
330                         }
331
332                         stencilTest(valueCCW, state.stencilCompareModeCCW, true);
333
                            // Select between the two results using the primitive's clockwiseMask
                            // and invClockwiseMask.
334                         value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
335                         valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
336                         value |= valueCCW;
337                 }
338
339                 sMask = SignMask(value) & cMask;
340         }
341
342         void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool CCW)
343         {
                    // Replaces value with the per-byte result of comparing it against the
                    // stencil reference for face CCW (stencil[0] or stencil[1] in DrawData).
                    // In the inline comments below, `a` is the reference and `b` the buffer value.
                    //
                    // The ordered comparisons add 0x80 to every byte and then use a signed
                    // CmpGT against referenceMaskedSignedQ.
                    // NOTE(review): presumably the reference is stored with the same 0x80 bias
                    // so that the signed compare orders the unsigned values - confirm where
                    // DrawData is filled in.
344                 Byte8 equal;
345
346                 switch(stencilCompareMode)
347                 {
348                 case VK_COMPARE_OP_ALWAYS:
349                         value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
350                         break;
351                 case VK_COMPARE_OP_NEVER:
352                         value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
353                         break;
354                 case VK_COMPARE_OP_LESS:                        // a < b ~ b > a
355                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
356                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
357                         break;
358                 case VK_COMPARE_OP_EQUAL:
359                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
360                         break;
361                 case VK_COMPARE_OP_NOT_EQUAL:           // a != b ~ !(a == b)
362                         value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
363                         value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
364                         break;
365                 case VK_COMPARE_OP_LESS_OR_EQUAL:       // a <= b ~ (b > a) || (a == b)
                            // The equality test uses the unbiased value, so it is taken before the bias is added.
366                         equal = value;
367                         equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
368                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
369                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
370                         value |= equal;
371                         break;
372                 case VK_COMPARE_OP_GREATER:             // a > b
                            // `equal` is reused here as a temporary that holds the biased reference.
373                         equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
374                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
375                         equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
376                         value = equal;
377                         break;
378                 case VK_COMPARE_OP_GREATER_OR_EQUAL:    // a >= b ~ !(a < b) ~ !(b > a)
379                         value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
380                         value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
381                         value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
382                         break;
383                 default:
                            // Any other compare op is treated as a programming error.
384                         ASSERT(false);
385                 }
386         }
387
388         Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
389         {
                    // Compares the quad's depth for sample q against the depth buffer and writes
                    // the per-pixel result to zMask (restricted to cMask and, when stencilling is
                    // active, to sMask). Returns whether any bit of zMask is set.
                    // With the depth test inactive it returns true without touching zMask.
390                 if(!state.depthTestActive)
391                 {
392                         return true;
393                 }
394
395                 Float4 Z = z;
396
                    // A shader that overrides depth supplies oDepth instead of the interpolated z;
                    // with a complementary depth buffer the value used is 1 - oDepth.
397                 if(shader && shader->depthOverride())
398                 {
399                         if(complementaryDepthBuffer)
400                         {
401                                 Z = Float4(1.0f) - oDepth;
402                         }
403                         else
404                         {
405                                 Z = oDepth;
406                         }
407                 }
408
409                 Pointer<Byte> buffer;
410                 Int pitch;
411
                    // Linear layout: 4 bytes per unit of x, with the second row of the quad one
                    // pitch further on. Quad layout: 8 bytes per unit of x; pitch is not set or used.
412                 if(!state.quadLayoutDepthBuffer)
413                 {
414                         buffer = zBuffer + 4 * x;
415                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
416                 }
417                 else
418                 {
419                         buffer = zBuffer + 8 * x;
420                 }
421
422                 if(q > 0)
423                 {
424                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
425                 }
426
427                 Float4 zValue;
428
                    // NOTE(review): this condition reduces to
                    // `mode != NEVER || !state.depthWriteEnable`, so the buffer is also loaded for
                    // VK_COMPARE_OP_ALWAYS even though that mode never reads zValue in this function.
429                 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
430                 {
431                         if(!state.quadLayoutDepthBuffer)
432                         {
433                                 // FIXME: Properly optimizes?
                                    // The second load starts 8 bytes before the next row so that its .zw
                                    // lanes are the two floats at buffer + pitch.
434                                 zValue.xy = *Pointer<Float4>(buffer);
435                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
436                         }
437                         else
438                         {
439                                 zValue = *Pointer<Float4>(buffer, 16);
440                         }
441                 }
442
443                 Int4 zTest;
444
                    // Each comparison has the buffer value on the left and the new depth Z on the
                    // right; complementaryDepthBuffer swaps the direction of the ordered tests.
445                 switch(state.depthCompareMode)
446                 {
447                 case VK_COMPARE_OP_ALWAYS:
448                         // Optimized
449                         break;
450                 case VK_COMPARE_OP_NEVER:
451                         // Optimized
452                         break;
453                 case VK_COMPARE_OP_EQUAL:
454                         zTest = CmpEQ(zValue, Z);
455                         break;
456                 case VK_COMPARE_OP_NOT_EQUAL:
457                         zTest = CmpNEQ(zValue, Z);
458                         break;
459                 case VK_COMPARE_OP_LESS:
460                         if(complementaryDepthBuffer)
461                         {
462                                 zTest = CmpLT(zValue, Z);
463                         }
464                         else
465                         {
466                                 zTest = CmpNLE(zValue, Z);
467                         }
468                         break;
469                 case VK_COMPARE_OP_GREATER_OR_EQUAL:
470                         if(complementaryDepthBuffer)
471                         {
472                                 zTest = CmpNLT(zValue, Z);
473                         }
474                         else
475                         {
476                                 zTest = CmpLE(zValue, Z);
477                         }
478                         break;
479                 case VK_COMPARE_OP_LESS_OR_EQUAL:
480                         if(complementaryDepthBuffer)
481                         {
482                                 zTest = CmpLE(zValue, Z);
483                         }
484                         else
485                         {
486                                 zTest = CmpNLT(zValue, Z);
487                         }
488                         break;
489                 case VK_COMPARE_OP_GREATER:
490                         if(complementaryDepthBuffer)
491                         {
492                                 zTest = CmpNLE(zValue, Z);
493                         }
494                         else
495                         {
496                                 zTest = CmpLT(zValue, Z);
497                         }
498                         break;
499                 default:
500                         ASSERT(false);
501                 }
502
                    // ALWAYS and NEVER bypass zTest: the mask is the coverage mask or zero.
503                 switch(state.depthCompareMode)
504                 {
505                 case VK_COMPARE_OP_ALWAYS:
506                         zMask = cMask;
507                         break;
508                 case VK_COMPARE_OP_NEVER:
509                         zMask = 0x0;
510                         break;
511                 default:
512                         zMask = SignMask(zTest) & cMask;
513                         break;
514                 }
515
516                 if(state.stencilActive)
517                 {
518                         zMask &= sMask;
519                 }
520
521                 return zMask != 0;
522         }
523
524         void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
525         {
                    // Compares alpha against DrawData factor.alphaReference4 using
                    // state.alphaCompareMode and stores the outcome in aMask.
                    // ALWAYS yields 0xF and NEVER yields 0x0; the other modes derive the mask
                    // from the comparison via PackSigned (with a zero upper half) and SignMask.
                    // NOTE(review): presumably this gives one bit per pixel in the low four
                    // bits, matching the 0xF of the ALWAYS case - confirm.
526                 Short4 cmp;
527                 Short4 equal;
528
529                 switch(state.alphaCompareMode)
530                 {
531                 case VK_COMPARE_OP_ALWAYS:
532                         aMask = 0xF;
533                         break;
534                 case VK_COMPARE_OP_NEVER:
535                         aMask = 0x0;
536                         break;
537                 case VK_COMPARE_OP_EQUAL:
538                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
539                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
540                         break;
541                 case VK_COMPARE_OP_NOT_EQUAL:       // a != b ~ !(a == b)
542                         cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
543                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
544                         break;
545                 case VK_COMPARE_OP_LESS:           // a < b ~ b > a
                            // Operands are swapped so that CmpGT can express "alpha < reference".
546                         cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
547                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
548                         break;
549                 case VK_COMPARE_OP_GREATER_OR_EQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
550                         equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
551                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
552                         cmp |= equal;
553                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
554                         break;
555                 case VK_COMPARE_OP_LESS_OR_EQUAL:      // a <= b ~ !(a > b)
556                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
557                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
558                         break;
559                 case VK_COMPARE_OP_GREATER:        // a > b
560                         cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561                         aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
562                         break;
563                 default:
                            // Any other compare op is treated as a programming error; aMask is not written.
564                         ASSERT(false);
565                 }
566         }
567
568         void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
569         {
                    // Narrows the four coverage masks according to alpha: cMask[i] keeps only
                    // the pixels whose alpha is not less than the DrawData threshold a2c<i>.
570                 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
571                 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
572                 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
573                 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
574
                    // Collapse each per-lane comparison result into an integer bit mask.
575                 Int aMask0 = SignMask(coverage0);
576                 Int aMask1 = SignMask(coverage1);
577                 Int aMask2 = SignMask(coverage2);
578                 Int aMask3 = SignMask(coverage3);
579
                    // All four entries are updated unconditionally, whatever the sample count.
580                 cMask[0] &= aMask0;
581                 cMask[1] &= aMask1;
582                 cMask[2] &= aMask2;
583                 cMask[3] &= aMask3;
584         }
585
586         void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
587         {
                    // Stores the quad's depth for sample q, replacing only the pixels selected by
                    // zMask and preserving the existing buffer contents for the rest.
                    // Does nothing when depth writes are disabled.
588                 if(!state.depthWriteEnable)
589                 {
590                         return;
591                 }
592
593                 Float4 Z = z;
594
                    // Same depth source selection as depthTest(): oDepth, or 1 - oDepth with a
                    // complementary depth buffer, when the shader overrides depth.
595                 if(shader && shader->depthOverride())
596                 {
597                         if(complementaryDepthBuffer)
598                         {
599                                 Z = Float4(1.0f) - oDepth;
600                         }
601                         else
602                         {
603                                 Z = oDepth;
604                         }
605                 }
606
607                 Pointer<Byte> buffer;
608                 Int pitch;
609
                    // Linear layout: 4 bytes per unit of x plus a row pitch. Quad layout:
                    // 8 bytes per unit of x; pitch is not set or used.
610                 if(!state.quadLayoutDepthBuffer)
611                 {
612                         buffer = zBuffer + 4 * x;
613                         pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
614                 }
615                 else
616                 {
617                         buffer = zBuffer + 8 * x;
618                 }
619
620                 if(q > 0)
621                 {
622                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
623                 }
624
625                 Float4 zValue;
626
                    // NOTE(review): depthWriteEnable is known to be true here because of the
                    // early return above, so this condition is equivalent to
                    // `state.depthCompareMode != VK_COMPARE_OP_NEVER`. With NEVER, zValue is not
                    // loaded before the merge below.
627                 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
628                 {
629                         if(!state.quadLayoutDepthBuffer)
630                         {
631                                 // FIXME: Properly optimizes?
632                                 zValue.xy = *Pointer<Float4>(buffer);
633                                 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
634                         }
635                         else
636                         {
637                                 zValue = *Pointer<Float4>(buffer, 16);
638                         }
639                 }
640
                    // Merge: keep the new depth where the maskD4X entry for zMask is set, keep
                    // the old buffer value where invMaskD4X is set, then OR the two together.
641                 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
642                 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
643                 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
644
645                 if(!state.quadLayoutDepthBuffer)
646                 {
647                         // FIXME: Properly optimizes?
                            // Two floats go to the first row, the other two to the row one pitch below.
648                         *Pointer<Float2>(buffer) = Float2(Z.xy);
649                         *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
650                 }
651                 else
652                 {
653                         *Pointer<Float4>(buffer, 16) = Z;
654                 }
655         }
656
657         void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
658         {
659                 if(!state.stencilActive)
660                 {
661                         return;
662                 }
663
664                 if(state.stencilPassOperation == VK_STENCIL_OP_KEEP && state.stencilZFailOperation == VK_STENCIL_OP_KEEP && state.stencilFailOperation == VK_STENCIL_OP_KEEP)
665                 {
666                         if(!state.twoSidedStencil || (state.stencilPassOperationCCW == VK_STENCIL_OP_KEEP && state.stencilZFailOperationCCW == VK_STENCIL_OP_KEEP && state.stencilFailOperationCCW == VK_STENCIL_OP_KEEP))
667                         {
668                                 return;
669                         }
670                 }
671
672                 if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
673                 {
674                         return;
675                 }
676
677                 Pointer<Byte> buffer = sBuffer + 2 * x;
678
679                 if(q > 0)
680                 {
681                         buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
682                 }
683
684                 Byte8 bufferValue = *Pointer<Byte8>(buffer);
685
686                 Byte8 newValue;
687                 stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
688
689                 if(!state.noStencilWriteMask)
690                 {
691                         Byte8 maskedValue = bufferValue;
692                         newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
693                         maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
694                         newValue |= maskedValue;
695                 }
696
697                 if(state.twoSidedStencil)
698                 {
699                         Byte8 newValueCCW;
700
701                         stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
702
703                         if(!state.noStencilWriteMaskCCW)
704                         {
705                                 Byte8 maskedValue = bufferValue;
706                                 newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
707                                 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
708                                 newValueCCW |= maskedValue;
709                         }
710
711                         newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
712                         newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
713                         newValue |= newValueCCW;
714                 }
715
716                 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
717                 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
718                 newValue |= bufferValue;
719
720                 *Pointer<Byte4>(buffer) = Byte4(newValue);
721         }
722
723         void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, VkStencilOp stencilPassOperation, VkStencilOp stencilZFailOperation, VkStencilOp stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
724         {
725                 Byte8 &pass = newValue;
726                 Byte8 fail;
727                 Byte8 zFail;
728
729                 stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
730
731                 if(stencilZFailOperation != stencilPassOperation)
732                 {
733                         stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
734                 }
735
736                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
737                 {
738                         stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
739                 }
740
741                 if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
742                 {
743                         if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
744                         {
745                                 pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
746                                 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
747                                 pass |= zFail;
748                         }
749
750                         pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
751                         fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
752                         pass |= fail;
753                 }
754         }
755
756         void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, VkStencilOp operation, bool CCW)
757         {
758                 switch(operation)
759                 {
760                 case VK_STENCIL_OP_KEEP:
761                         output = bufferValue;
762                         break;
763                 case VK_STENCIL_OP_ZERO:
764                         output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
765                         break;
766                 case VK_STENCIL_OP_REPLACE:
767                         output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
768                         break;
769                 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
770                         output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
771                         break;
772                 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
773                         output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
774                         break;
775                 case VK_STENCIL_OP_INVERT:
776                         output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
777                         break;
778                 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
779                         output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
780                         break;
781                 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
782                         output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
783                         break;
784                 default:
785                         ASSERT(false);
786                 }
787         }
788
789         void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
790         {
791                 switch(blendFactorActive)
792                 {
793                 case VK_BLEND_FACTOR_ZERO:
794                         // Optimized
795                         break;
796                 case VK_BLEND_FACTOR_ONE:
797                         // Optimized
798                         break;
799                 case VK_BLEND_FACTOR_SRC_COLOR:
800                         blendFactor.x = current.x;
801                         blendFactor.y = current.y;
802                         blendFactor.z = current.z;
803                         break;
804                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
805                         blendFactor.x = Short4(0xFFFFu) - current.x;
806                         blendFactor.y = Short4(0xFFFFu) - current.y;
807                         blendFactor.z = Short4(0xFFFFu) - current.z;
808                         break;
809                 case VK_BLEND_FACTOR_DST_COLOR:
810                         blendFactor.x = pixel.x;
811                         blendFactor.y = pixel.y;
812                         blendFactor.z = pixel.z;
813                         break;
814                 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
815                         blendFactor.x = Short4(0xFFFFu) - pixel.x;
816                         blendFactor.y = Short4(0xFFFFu) - pixel.y;
817                         blendFactor.z = Short4(0xFFFFu) - pixel.z;
818                         break;
819                 case VK_BLEND_FACTOR_SRC_ALPHA:
820                         blendFactor.x = current.w;
821                         blendFactor.y = current.w;
822                         blendFactor.z = current.w;
823                         break;
824                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
825                         blendFactor.x = Short4(0xFFFFu) - current.w;
826                         blendFactor.y = Short4(0xFFFFu) - current.w;
827                         blendFactor.z = Short4(0xFFFFu) - current.w;
828                         break;
829                 case VK_BLEND_FACTOR_DST_ALPHA:
830                         blendFactor.x = pixel.w;
831                         blendFactor.y = pixel.w;
832                         blendFactor.z = pixel.w;
833                         break;
834                 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
835                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
836                         blendFactor.y = Short4(0xFFFFu) - pixel.w;
837                         blendFactor.z = Short4(0xFFFFu) - pixel.w;
838                         break;
839                 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
840                         blendFactor.x = Short4(0xFFFFu) - pixel.w;
841                         blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
842                         blendFactor.y = blendFactor.x;
843                         blendFactor.z = blendFactor.x;
844                         break;
845                 case VK_BLEND_FACTOR_CONSTANT_COLOR:
846                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
847                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
848                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
849                         break;
850                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
851                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
852                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
853                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
854                         break;
855                 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
856                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
857                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
858                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
859                         break;
860                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
861                         blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
862                         blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
863                         blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
864                         break;
865                 default:
866                         ASSERT(false);
867                 }
868         }
869
870         void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
871         {
872                 switch(blendFactorAlphaActive)
873                 {
874                 case VK_BLEND_FACTOR_ZERO:
875                         // Optimized
876                         break;
877                 case VK_BLEND_FACTOR_ONE:
878                         // Optimized
879                         break;
880                 case VK_BLEND_FACTOR_SRC_COLOR:
881                         blendFactor.w = current.w;
882                         break;
883                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
884                         blendFactor.w = Short4(0xFFFFu) - current.w;
885                         break;
886                 case VK_BLEND_FACTOR_DST_COLOR:
887                         blendFactor.w = pixel.w;
888                         break;
889                 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
890                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
891                         break;
892                 case VK_BLEND_FACTOR_SRC_ALPHA:
893                         blendFactor.w = current.w;
894                         break;
895                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
896                         blendFactor.w = Short4(0xFFFFu) - current.w;
897                         break;
898                 case VK_BLEND_FACTOR_DST_ALPHA:
899                         blendFactor.w = pixel.w;
900                         break;
901                 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
902                         blendFactor.w = Short4(0xFFFFu) - pixel.w;
903                         break;
904                 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
905                         blendFactor.w = Short4(0xFFFFu);
906                         break;
907                 case VK_BLEND_FACTOR_CONSTANT_COLOR:
908                 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
909                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
910                         break;
911                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
912                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
913                         blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
914                         break;
915                 default:
916                         ASSERT(false);
917                 }
918         }
919
920         bool PixelRoutine::isSRGB(int index) const
921         {
922                 return Surface::isSRGBformat(state.targetFormat[index]);
923         }
924
925         void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
926         {
927                 Short4 c01;
928                 Short4 c23;
929                 Pointer<Byte> buffer;
930                 Pointer<Byte> buffer2;
931
932                 switch(state.targetFormat[index])
933                 {
934                 case VK_FORMAT_R5G6B5_UNORM_PACK16:
935                         buffer = cBuffer + 2 * x;
936                         buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
937                         c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
938
939                         pixel.x = c01 & Short4(0xF800u);
940                         pixel.y = (c01 & Short4(0x07E0u)) << 5;
941                         pixel.z = (c01 & Short4(0x001Fu)) << 11;
942                         pixel.w = Short4(0xFFFFu);
943                         break;
944                 case VK_FORMAT_B8G8R8A8_UNORM:
945                         buffer = cBuffer + 4 * x;
946                         c01 = *Pointer<Short4>(buffer);
947                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
948                         c23 = *Pointer<Short4>(buffer);
949                         pixel.z = c01;
950                         pixel.y = c01;
951                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
952                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
953                         pixel.x = pixel.z;
954                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
955                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
956                         pixel.y = pixel.z;
957                         pixel.w = pixel.x;
958                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
959                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
960                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
961                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
962                         break;
963                 case VK_FORMAT_R8G8B8A8_UNORM:
964                 case VK_FORMAT_R8G8B8A8_SRGB:
965                         buffer = cBuffer + 4 * x;
966                         c01 = *Pointer<Short4>(buffer);
967                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
968                         c23 = *Pointer<Short4>(buffer);
969                         pixel.z = c01;
970                         pixel.y = c01;
971                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
972                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
973                         pixel.x = pixel.z;
974                         pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
975                         pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
976                         pixel.y = pixel.z;
977                         pixel.w = pixel.x;
978                         pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
979                         pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
980                         pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
981                         pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
982                         break;
983                 case VK_FORMAT_R8_UNORM:
984                         buffer = cBuffer + 1 * x;
985                         pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
986                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
987                         pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
988                         pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
989                         pixel.y = Short4(0x0000);
990                         pixel.z = Short4(0x0000);
991                         pixel.w = Short4(0xFFFFu);
992                         break;
993                 case VK_FORMAT_R8G8_UNORM:
994                         buffer = cBuffer + 2 * x;
995                         c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
996                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
997                         c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
998                         pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
999                         pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1000                         pixel.z = Short4(0x0000u);
1001                         pixel.w = Short4(0xFFFFu);
1002                         break;
1003                 case VK_FORMAT_R16G16B16A16_UNORM:
1004                         buffer = cBuffer;
1005                         pixel.x = *Pointer<Short4>(buffer + 8 * x);
1006                         pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1007                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1008                         pixel.z = *Pointer<Short4>(buffer + 8 * x);
1009                         pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1010                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1011                         break;
1012                 case VK_FORMAT_R16G16_UNORM:
1013                         buffer = cBuffer;
1014                         pixel.x = *Pointer<Short4>(buffer + 4 * x);
1015                         buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1016                         pixel.y = *Pointer<Short4>(buffer + 4 * x);
1017                         pixel.z = pixel.x;
1018                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1019                         pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1020                         pixel.y = pixel.z;
1021                         pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1022                         pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1023                         pixel.z = Short4(0xFFFFu);
1024                         pixel.w = Short4(0xFFFFu);
1025                         break;
1026                 default:
1027                         ASSERT(false);
1028                 }
1029
1030                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1031                 {
1032                         sRGBtoLinear16_12_16(pixel);
1033                 }
1034         }
1035
1036         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1037         {
1038                 if(!state.alphaBlendActive)
1039                 {
1040                         return;
1041                 }
1042
1043                 Vector4s pixel;
1044                 readPixel(index, cBuffer, x, pixel);
1045
1046                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1047                 Vector4s sourceFactor;
1048                 Vector4s destFactor;
1049
1050                 blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1051                 blendFactor(destFactor, current, pixel, state.destBlendFactor);
1052
1053                 if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
1054                 {
1055                         current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1056                         current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1057                         current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1058                 }
1059
1060                 if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO)
1061                 {
1062                         pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1063                         pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1064                         pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1065                 }
1066
1067                 switch(state.blendOperation)
1068                 {
1069                 case VK_BLEND_OP_ADD:
1070                         current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1071                         current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1072                         current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1073                         break;
1074                 case VK_BLEND_OP_SUBTRACT:
1075                         current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1076                         current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1077                         current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1078                         break;
1079                 case VK_BLEND_OP_REVERSE_SUBTRACT:
1080                         current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1081                         current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1082                         current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1083                         break;
1084                 case VK_BLEND_OP_MIN:
1085                         current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1086                         current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1087                         current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1088                         break;
1089                 case VK_BLEND_OP_MAX:
1090                         current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1091                         current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1092                         current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1093                         break;
1094                 case VK_BLEND_OP_SRC_EXT:
1095                         // No operation
1096                         break;
1097                 case VK_BLEND_OP_DST_EXT:
1098                         current.x = pixel.x;
1099                         current.y = pixel.y;
1100                         current.z = pixel.z;
1101                         break;
1102                 case VK_BLEND_OP_ZERO_EXT:
1103                         current.x = Short4(0x0000);
1104                         current.y = Short4(0x0000);
1105                         current.z = Short4(0x0000);
1106                         break;
1107                 default:
1108                         ASSERT(false);
1109                 }
1110
1111                 blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1112                 blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1113
1114                 if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1115                 {
1116                         current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1117                 }
1118
1119                 if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1120                 {
1121                         pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1122                 }
1123
1124                 switch(state.blendOperationAlpha)
1125                 {
1126                 case VK_BLEND_OP_ADD:
1127                         current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1128                         break;
1129                 case VK_BLEND_OP_SUBTRACT:
1130                         current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1131                         break;
1132                 case VK_BLEND_OP_REVERSE_SUBTRACT:
1133                         current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1134                         break;
1135                 case VK_BLEND_OP_MIN:
1136                         current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1137                         break;
1138                 case VK_BLEND_OP_MAX:
1139                         current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1140                         break;
1141                 case VK_BLEND_OP_SRC_EXT:
1142                         // No operation
1143                         break;
1144                 case VK_BLEND_OP_DST_EXT:
1145                         current.w = pixel.w;
1146                         break;
1147                 case VK_BLEND_OP_ZERO_EXT:
1148                         current.w = Short4(0x0000);
1149                         break;
1150                 default:
1151                         ASSERT(false);
1152                 }
1153         }
1154
1155         void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1156         {
1157                 if(state.logicalOperation == VK_LOGIC_OP_COPY)
1158                 {
1159                         return;
1160                 }
1161
1162                 Vector4s pixel;
1163                 readPixel(index, cBuffer, x, pixel);
1164
1165                 switch(state.logicalOperation)
1166                 {
1167                 case VK_LOGIC_OP_CLEAR:
1168                         current.x = UShort4(0);
1169                         current.y = UShort4(0);
1170                         current.z = UShort4(0);
1171                         break;
1172                 case VK_LOGIC_OP_SET:
1173                         current.x = UShort4(0xFFFFu);
1174                         current.y = UShort4(0xFFFFu);
1175                         current.z = UShort4(0xFFFFu);
1176                         break;
1177                 case VK_LOGIC_OP_COPY:
1178                         ASSERT(false);   // Optimized out
1179                         break;
1180                 case VK_LOGIC_OP_COPY_INVERTED:
1181                         current.x = ~current.x;
1182                         current.y = ~current.y;
1183                         current.z = ~current.z;
1184                         break;
1185                 case VK_LOGIC_OP_NO_OP:
1186                         current.x = pixel.x;
1187                         current.y = pixel.y;
1188                         current.z = pixel.z;
1189                         break;
1190                 case VK_LOGIC_OP_INVERT:
1191                         current.x = ~pixel.x;
1192                         current.y = ~pixel.y;
1193                         current.z = ~pixel.z;
1194                         break;
1195                 case VK_LOGIC_OP_AND:
1196                         current.x = pixel.x & current.x;
1197                         current.y = pixel.y & current.y;
1198                         current.z = pixel.z & current.z;
1199                         break;
1200                 case VK_LOGIC_OP_NAND:
1201                         current.x = ~(pixel.x & current.x);
1202                         current.y = ~(pixel.y & current.y);
1203                         current.z = ~(pixel.z & current.z);
1204                         break;
1205                 case VK_LOGIC_OP_OR:
1206                         current.x = pixel.x | current.x;
1207                         current.y = pixel.y | current.y;
1208                         current.z = pixel.z | current.z;
1209                         break;
1210                 case VK_LOGIC_OP_NOR:
1211                         current.x = ~(pixel.x | current.x);
1212                         current.y = ~(pixel.y | current.y);
1213                         current.z = ~(pixel.z | current.z);
1214                         break;
1215                 case VK_LOGIC_OP_XOR:
1216                         current.x = pixel.x ^ current.x;
1217                         current.y = pixel.y ^ current.y;
1218                         current.z = pixel.z ^ current.z;
1219                         break;
1220                 case VK_LOGIC_OP_EQUIVALENT:
1221                         current.x = ~(pixel.x ^ current.x);
1222                         current.y = ~(pixel.y ^ current.y);
1223                         current.z = ~(pixel.z ^ current.z);
1224                         break;
1225                 case VK_LOGIC_OP_AND_REVERSE:
1226                         current.x = ~pixel.x & current.x;
1227                         current.y = ~pixel.y & current.y;
1228                         current.z = ~pixel.z & current.z;
1229                         break;
1230                 case VK_LOGIC_OP_AND_INVERTED:
1231                         current.x = pixel.x & ~current.x;
1232                         current.y = pixel.y & ~current.y;
1233                         current.z = pixel.z & ~current.z;
1234                         break;
1235                 case VK_LOGIC_OP_OR_REVERSE:
1236                         current.x = ~pixel.x | current.x;
1237                         current.y = ~pixel.y | current.y;
1238                         current.z = ~pixel.z | current.z;
1239                         break;
1240                 case VK_LOGIC_OP_OR_INVERTED:
1241                         current.x = pixel.x | ~current.x;
1242                         current.y = pixel.y | ~current.y;
1243                         current.z = pixel.z | ~current.z;
1244                         break;
1245                 default:
1246                         ASSERT(false);
1247                 }
1248         }
1249
1250         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1251         {
                     // Packs the 16-bit-per-channel color in 'current' into the memory layout of
                     // state.targetFormat[index] and merges it into the color buffer at column x,
                     // on the row starting at cBuffer and on the row colorPitchB[index] bytes after it.
                     //
                     //   index   - color target slot; selects the format, write mask and pitch used here.
                     //   cBuffer - address of the first of the two rows that are written.
                     //   x       - pixel column; scaled below by the byte size of one pixel of the format.
                     //   current - color to store; modified in place while it is rounded and packed.
                     //   sMask, zMask, cMask - per-pixel masks folded into xMask below
                     //             (stencil / depth / coverage judging by the names - TODO confirm).
                     //
                     // Every store is a read-modify-write: bytes that are masked off are written
                     // back with the value that was just read from the target.
1252                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1253                 {
1254                         linearToSRGB16_12_16(current);
1255                 }
1256
                     // Optional rounding bias, added before the channels are truncated by the packing switch below.
1257                 if(exactColorRounding)
1258                 {
1259                         switch(state.targetFormat[index])
1260                         {
1261                         case VK_FORMAT_R5G6B5_UNORM_PACK16:
                                     // 0x0400 / 0x0200 / 0x0400 are half of the lowest bit that survives the
                                     // 5 / 6 / 5-bit truncation done in the packing switch.
1262                                 current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1263                                 current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1264                                 current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1265                                 break;
1266                         case VK_FORMAT_B8G8R8A8_UNORM:
1267                         case VK_FORMAT_R8G8B8A8_UNORM:
1268                         case VK_FORMAT_R8G8B8A8_SRGB:
1269                         case VK_FORMAT_R8G8_UNORM:
1270                         case VK_FORMAT_R8_UNORM:
                                     // Applied ahead of the '>> 8' in the packing switch.
                                     // NOTE(review): x - (x >> 8) + 0x80 looks like a round-to-nearest rescale
                                     // from 16 to 8 bits - confirm.
1271                                 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1272                                 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1273                                 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1274                                 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1275                                 break;
1276                         default:
1277                                 break;
1278                         }
1279                 }
1280
                     // bgraWriteMask is rgbaWriteMask with bit 0 and bit 2 exchanged (bits 1 and 3 kept);
                     // it is the mask used below for the formats whose first stored channel is B.
1281                 int rgbaWriteMask = state.colorWriteActive(index);
1282                 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1283
                     // Stage 1: pack the channels into the memory layout of the target format.
1284                 switch(state.targetFormat[index])
1285                 {
1286                 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1287                         {
                                     // Keep the top 5 / 6 / 5 bits of x / y / z and OR them into one 16-bit
                                     // value per lane, left in current.x.
1288                                 current.x = current.x & Short4(0xF800u);
1289                                 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1290                                 current.z = As<UShort4>(current.z) >> 11;
1291
1292                                 current.x = current.x | current.y | current.z;
1293                         }
1294                         break;
1295                 case VK_FORMAT_B8G8R8A8_UNORM:
1296                         if(rgbaWriteMask == 0x7)
1297                         {
                                     // The mask is exactly 0x7, so current.w is neither shifted nor packed;
                                     // current.y is paired with itself in its place.
1298                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1299                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1300                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1301
1302                                 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1303                                 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1304
                                     // Interleave the packed bytes; the result ends up in current.z and
                                     // current.y, which are picked up as c01 and c23 after this switch.
1305                                 current.x = current.z;
1306                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1307                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1308                                 current.y = current.z;
1309                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1310                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1311                         }
1312                         else
1313                         {
1314                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1315                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1316                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1317                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1318
1319                                 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1320                                 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1321
1322                                 current.x = current.z;
1323                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1324                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1325                                 current.y = current.z;
1326                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1327                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1328                         }
1329                         break;
1330                 case VK_FORMAT_R8G8B8A8_UNORM:
1331                 case VK_FORMAT_R8G8B8A8_SRGB:
                             // Same sequence as the B8G8R8A8 case; only the operand order of the first
                             // PackUnsigned (x, z instead of z, x) differs.
1332                         if(rgbaWriteMask == 0x7)
1333                         {
1334                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1335                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1336                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1337
1338                                 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1339                                 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1340
1341                                 current.x = current.z;
1342                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1343                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1344                                 current.y = current.z;
1345                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1346                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1347                         }
1348                         else
1349                         {
1350                                 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1351                                 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1352                                 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1353                                 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1354
1355                                 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1356                                 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1357
1358                                 current.x = current.z;
1359                                 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1360                                 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1361                                 current.y = current.z;
1362                                 current.z = As<Short4>(UnpackLow(current.z, current.x));
1363                                 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1364                         }
1365                         break;
1366                 case VK_FORMAT_R8G8_UNORM:
                             // Reduce x and y to bytes and interleave them; the result is left in current.x.
1367                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1368                         current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1369                         current.x = As<Short4>(PackUnsigned(current.x, current.x));
1370                         current.y = As<Short4>(PackUnsigned(current.y, current.y));
1371                         current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1372                         break;
1373                 case VK_FORMAT_R8_UNORM:
1374                         current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1375                         current.x = As<Short4>(PackUnsigned(current.x, current.x));
1376                         break;
1377                 case VK_FORMAT_R16G16_UNORM:
                             // Interleave the x and y lanes: the low half ends up in current.x, the high
                             // half in current.y (current.z is only a temporary here).
1378                         current.z = current.x;
1379                         current.x = As<Short4>(UnpackLow(current.x, current.y));
1380                         current.z = As<Short4>(UnpackHigh(current.z, current.y));
1381                         current.y = current.z;
1382                         break;
1383                 case VK_FORMAT_R16G16B16A16_UNORM:
1384                         transpose4x4(current.x, current.y, current.z, current.w);
1385                         break;
1386                 default:
1387                         ASSERT(false);
1388                 }
1389
                     // c01 / c23 are only used by the B8G8R8A8 and R8G8B8A8 store paths below; the
                     // other formats store from current.x/.y/.z/.w directly.
1390                 Short4 c01 = current.z;
1391                 Short4 c23 = current.y;
1392
1393                 Int xMask;   // Combination of all masks
1394
                     // zMask is used when depth testing is active, cMask otherwise.
1395                 if(state.depthTestActive)
1396                 {
1397                         xMask = zMask;
1398                 }
1399                 else
1400                 {
1401                         xMask = cMask;
1402                 }
1403
1404                 if(state.stencilActive)
1405                 {
1406                         xMask &= sMask;
1407                 }
1408
                     // Stage 2: read-modify-write the target, first row then second row.
                     // 'xMask * 8' selects one 8-byte entry of the per-pixel mask tables in Constants.
1409                 switch(state.targetFormat[index])
1410                 {
1411                 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1412                         {
                                     // Each Int load/store covers two 16-bit pixels of one row.
1413                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1414                                 Int value = *Pointer<Int>(buffer);
1415
                                     // This Int c01 (and c23 further down) shadows the Short4 of the same
                                     // name declared before the switch.
1416                                 Int c01 = Extract(As<Int2>(current.x), 0);
1417
                                     // Channel masking: new color through mask565Q[mask], existing target
                                     // value through mask565Q[~mask].
1418                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1419                                 {
1420                                         Int masked = value;
1421                                         c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1422                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1423                                         c01 |= masked;
1424                                 }
1425
                                     // Pixel masking by xMask.
1426                                 c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1427                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1428                                 c01 |= value;
1429                                 *Pointer<Int>(buffer) = c01;
1430
                                     // Second row.
1431                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1432                                 value = *Pointer<Int>(buffer);
1433
1434                                 Int c23 = Extract(As<Int2>(current.x), 1);
1435
1436                                 if((bgraWriteMask & 0x00000007) != 0x00000007)
1437                                 {
1438                                         Int masked = value;
1439                                         c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1440                                         masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1441                                         c23 |= masked;
1442                                 }
1443
                                     // Element [0][2] instead of [0][0]: presumably the part of the table
                                     // entry that belongs to the second row - TODO confirm against Constants.
1444                                 c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1445                                 value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1446                                 c23 |= value;
1447                                 *Pointer<Int>(buffer) = c23;
1448                         }
1449                         break;
1450                 case VK_FORMAT_B8G8R8A8_UNORM:
1451                         {
                                     // Each Short4 load/store covers two 4-byte pixels of one row.
1452                                 Pointer<Byte> buffer = cBuffer + x * 4;
1453                                 Short4 value = *Pointer<Short4>(buffer);
1454
                                     // The format comparison is always true inside this case label, so only
                                     // the bgraWriteMask test decides whether channel masking is needed.
1455                                 if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F)   // FIXME: Need for masking when XRGB && Fh?
1456                                 {
1457                                         Short4 masked = value;
1458                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1459                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1460                                         c01 |= masked;
1461                                 }
1462
1463                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1464                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1465                                 c01 |= value;
1466                                 *Pointer<Short4>(buffer) = c01;
1467
                                     // Second row, using the maskD23Q tables.
1468                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1469                                 value = *Pointer<Short4>(buffer);
1470
1471                                 if(state.targetFormat[index] == VK_FORMAT_B8G8R8A8_UNORM && bgraWriteMask != 0x0000000F)   // FIXME: Need for masking when XRGB && Fh?
1472                                 {
1473                                         Short4 masked = value;
1474                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1475                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1476                                         c23 |= masked;
1477                                 }
1478
1479                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1480                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1481                                 c23 |= value;
1482                                 *Pointer<Short4>(buffer) = c23;
1483                         }
1484                         break;
1485                 case VK_FORMAT_R8G8B8A8_UNORM:
1486                 case VK_FORMAT_R8G8B8A8_SRGB:
1487                         {
1488                                 Pointer<Byte> buffer = cBuffer + x * 4;
1489                                 Short4 value = *Pointer<Short4>(buffer);
1490
                                     // The format test is always true under these two case labels, so 'masked'
                                     // reduces to rgbaWriteMask != 0xF. Inside each if-body below a Short4 named
                                     // 'masked' shadows this bool; the if conditions still test the bool.
1491                                 bool masked = ((state.targetFormat[index] == VK_FORMAT_R8G8B8A8_UNORM || state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SRGB) && rgbaWriteMask != 0x0000000F); // FIXME: Need for masking when XBGR && Fh?
1492
1493                                 if(masked)
1494                                 {
1495                                         Short4 masked = value;
1496                                         c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1497                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1498                                         c01 |= masked;
1499                                 }
1500
1501                                 c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1502                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1503                                 c01 |= value;
1504                                 *Pointer<Short4>(buffer) = c01;
1505
1506                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1507                                 value = *Pointer<Short4>(buffer);
1508
1509                                 if(masked)
1510                                 {
1511                                         Short4 masked = value;
1512                                         c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1513                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1514                                         c23 |= masked;
1515                                 }
1516
1517                                 c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1518                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1519                                 c23 |= value;
1520                                 *Pointer<Short4>(buffer) = c23;
1521                         }
1522                         break;
1523                 case VK_FORMAT_R8G8_UNORM:
                             // Nothing is read or written when neither of the two low mask bits is set.
1524                         if((rgbaWriteMask & 0x00000003) != 0x0)
1525                         {
                                     // Both rows (one Int each) are gathered into a single Int2 so that one
                                     // merge handles them together.
1526                                 Pointer<Byte> buffer = cBuffer + 2 * x;
1527                                 Int2 value;
1528                                 value = Insert(value, *Pointer<Int>(buffer), 0);
1529                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1530                                 value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1531
1532                                 Int2 packedCol = As<Int2>(current.x);
1533
1534                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1535                                 if((rgbaWriteMask & 0x3) != 0x3)
1536                                 {
                                             // Exactly one of the two channels is enabled here: the index
                                             // 5 * 1 = 0b0101 or 5 * 2 = 0b1010 selects the maskB4Q entry.
1537                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1538                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1539                                         mergedMask &= rgbaMask;
1540                                 }
1541
1542                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1543
1544                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1545                                 *Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1546                         }
1547                         break;
1548                 case VK_FORMAT_R8_UNORM:
                             // Only bit 0 of the write mask matters for this single-channel format.
1549                         if(rgbaWriteMask & 0x00000001)
1550                         {
                                     // One Short per row, i.e. two 1-byte pixels; only lanes 0 and 1 of
                                     // 'value' and current.x are loaded and stored.
1551                                 Pointer<Byte> buffer = cBuffer + 1 * x;
1552                                 Short4 value;
1553                                 value = Insert(value, *Pointer<Short>(buffer), 0);
1554                                 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1555                                 value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1556
1557                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1558                                 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1559                                 current.x |= value;
1560
1561                                 *Pointer<Short>(buffer) = Extract(current.x, 0);
1562                                 *Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1563                         }
1564                         break;
1565                 case VK_FORMAT_R16G16_UNORM:
1566                         {
                                     // current.x goes to the first row, current.y to the second.
1567                                 Pointer<Byte> buffer = cBuffer + 4 * x;
1568
1569                                 Short4 value = *Pointer<Short4>(buffer);
1570
                                     // Channel masking through maskW01Q, indexed by the mask for the new color
                                     // and by its complement for the existing target value.
1571                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1572                                 {
1573                                         Short4 masked = value;
1574                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1575                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1576                                         current.x |= masked;
1577                                 }
1578
1579                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1580                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1581                                 current.x |= value;
1582                                 *Pointer<Short4>(buffer) = current.x;
1583
1584                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1585
1586                                 value = *Pointer<Short4>(buffer);
1587
1588                                 if((rgbaWriteMask & 0x00000003) != 0x00000003)
1589                                 {
1590                                         Short4 masked = value;
1591                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1592                                         masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1593                                         current.y |= masked;
1594                                 }
1595
1596                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1597                                 value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1598                                 current.y |= value;
1599                                 *Pointer<Short4>(buffer) = current.y;
1600                         }
1601                         break;
1602                 case VK_FORMAT_R16G16B16A16_UNORM:
1603                         {
                                     // Four 8-byte stores: current.x and current.y at offsets 0 and 8 of the
                                     // first row, current.z and current.w at offsets 0 and 8 of the second.
                                     // Each store has its own pixel mask table (maskQ0Q .. maskQ3Q).
1604                                 Pointer<Byte> buffer = cBuffer + 8 * x;
1605
1606                                 {
1607                                         Short4 value = *Pointer<Short4>(buffer);
1608
1609                                         if(rgbaWriteMask != 0x0000000F)
1610                                         {
1611                                                 Short4 masked = value;
1612                                                 current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1613                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1614                                                 current.x |= masked;
1615                                         }
1616
1617                                         current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1618                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1619                                         current.x |= value;
1620                                         *Pointer<Short4>(buffer) = current.x;
1621                                 }
1622
1623                                 {
1624                                         Short4 value = *Pointer<Short4>(buffer + 8);
1625
1626                                         if(rgbaWriteMask != 0x0000000F)
1627                                         {
1628                                                 Short4 masked = value;
1629                                                 current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1630                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1631                                                 current.y |= masked;
1632                                         }
1633
1634                                         current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1635                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1636                                         current.y |= value;
1637                                         *Pointer<Short4>(buffer + 8) = current.y;
1638                                 }
1639
                                     // Second row.
1640                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1641
1642                                 {
1643                                         Short4 value = *Pointer<Short4>(buffer);
1644
1645                                         if(rgbaWriteMask != 0x0000000F)
1646                                         {
1647                                                 Short4 masked = value;
1648                                                 current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1649                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1650                                                 current.z |= masked;
1651                                         }
1652
1653                                         current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1654                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1655                                         current.z |= value;
1656                                         *Pointer<Short4>(buffer) = current.z;
1657                                 }
1658
1659                                 {
1660                                         Short4 value = *Pointer<Short4>(buffer + 8);
1661
1662                                         if(rgbaWriteMask != 0x0000000F)
1663                                         {
1664                                                 Short4 masked = value;
1665                                                 current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1666                                                 masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1667                                                 current.w |= masked;
1668                                         }
1669
1670                                         current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1671                                         value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1672                                         current.w |= value;
1673                                         *Pointer<Short4>(buffer + 8) = current.w;
1674                                 }
1675                         }
1676                         break;
1677                 default:
                             // Formats without a store path above are treated as a programming error.
1678                         ASSERT(false);
1679                 }
1680         }
1681
1682         void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
1683         {
1684                 switch(blendFactorActive)
1685                 {
1686                 case VK_BLEND_FACTOR_ZERO:
1687                         // Optimized
1688                         break;
1689                 case VK_BLEND_FACTOR_ONE:
1690                         // Optimized
1691                         break;
1692                 case VK_BLEND_FACTOR_SRC_COLOR:
1693                         blendFactor.x = oC.x;
1694                         blendFactor.y = oC.y;
1695                         blendFactor.z = oC.z;
1696                         break;
1697                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1698                         blendFactor.x = Float4(1.0f) - oC.x;
1699                         blendFactor.y = Float4(1.0f) - oC.y;
1700                         blendFactor.z = Float4(1.0f) - oC.z;
1701                         break;
1702                 case VK_BLEND_FACTOR_DST_COLOR:
1703                         blendFactor.x = pixel.x;
1704                         blendFactor.y = pixel.y;
1705                         blendFactor.z = pixel.z;
1706                         break;
1707                 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1708                         blendFactor.x = Float4(1.0f) - pixel.x;
1709                         blendFactor.y = Float4(1.0f) - pixel.y;
1710                         blendFactor.z = Float4(1.0f) - pixel.z;
1711                         break;
1712                 case VK_BLEND_FACTOR_SRC_ALPHA:
1713                         blendFactor.x = oC.w;
1714                         blendFactor.y = oC.w;
1715                         blendFactor.z = oC.w;
1716                         break;
1717                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1718                         blendFactor.x = Float4(1.0f) - oC.w;
1719                         blendFactor.y = Float4(1.0f) - oC.w;
1720                         blendFactor.z = Float4(1.0f) - oC.w;
1721                         break;
1722                 case VK_BLEND_FACTOR_DST_ALPHA:
1723                         blendFactor.x = pixel.w;
1724                         blendFactor.y = pixel.w;
1725                         blendFactor.z = pixel.w;
1726                         break;
1727                 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1728                         blendFactor.x = Float4(1.0f) - pixel.w;
1729                         blendFactor.y = Float4(1.0f) - pixel.w;
1730                         blendFactor.z = Float4(1.0f) - pixel.w;
1731                         break;
1732                 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1733                         blendFactor.x = Float4(1.0f) - pixel.w;
1734                         blendFactor.x = Min(blendFactor.x, oC.w);
1735                         blendFactor.y = blendFactor.x;
1736                         blendFactor.z = blendFactor.x;
1737                         break;
1738                 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1739                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1740                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1741                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1742                         break;
1743                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1744                         blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1745                         blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1746                         blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1747                         break;
1748                 default:
1749                         ASSERT(false);
1750                 }
1751         }
1752
1753         void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
1754         {
1755                 switch(blendFactorAlphaActive)
1756                 {
1757                 case VK_BLEND_FACTOR_ZERO:
1758                         // Optimized
1759                         break;
1760                 case VK_BLEND_FACTOR_ONE:
1761                         // Optimized
1762                         break;
1763                 case VK_BLEND_FACTOR_SRC_COLOR:
1764                         blendFactor.w = oC.w;
1765                         break;
1766                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1767                         blendFactor.w = Float4(1.0f) - oC.w;
1768                         break;
1769                 case VK_BLEND_FACTOR_DST_COLOR:
1770                         blendFactor.w = pixel.w;
1771                         break;
1772                 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1773                         blendFactor.w = Float4(1.0f) - pixel.w;
1774                         break;
1775                 case VK_BLEND_FACTOR_SRC_ALPHA:
1776                         blendFactor.w = oC.w;
1777                         break;
1778                 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1779                         blendFactor.w = Float4(1.0f) - oC.w;
1780                         break;
1781                 case VK_BLEND_FACTOR_DST_ALPHA:
1782                         blendFactor.w = pixel.w;
1783                         break;
1784                 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1785                         blendFactor.w = Float4(1.0f) - pixel.w;
1786                         break;
1787                 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1788                         blendFactor.w = Float4(1.0f);
1789                         break;
1790                 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1791                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
1792                         break;
1793                 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1794                         blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
1795                         break;
1796                 default:
1797                         ASSERT(false);
1798                 }
1799         }
1800
1801         void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
1802         {
1803                 if(!state.alphaBlendActive)
1804                 {
1805                         return;
1806                 }
1807
1808                 Pointer<Byte> buffer;
1809                 Vector4f pixel;
1810
1811                 Vector4s color;
1812                 Short4 c01;
1813                 Short4 c23;
1814
1815                 Float4 one;
1816                 if(Surface::isFloatFormat(state.targetFormat[index]))
1817                 {
1818                         one = Float4(1.0f);
1819                 }
1820                 else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
1821                 {
1822                         one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
1823                 }
1824
1825                 switch(state.targetFormat[index])
1826                 {
1827                 case VK_FORMAT_R32_SINT:
1828                 case VK_FORMAT_R32_UINT:
1829                 case VK_FORMAT_R32_SFLOAT:
1830                         buffer = cBuffer;
1831                         // FIXME: movlps
1832                         pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
1833                         pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
1834                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1835                         // FIXME: movhps
1836                         pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
1837                         pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
1838                         pixel.y = pixel.z = pixel.w = one;
1839                         break;
1840                 case VK_FORMAT_R32G32_SINT:
1841                 case VK_FORMAT_R32G32_UINT:
1842                 case VK_FORMAT_R32G32_SFLOAT:
1843                         buffer = cBuffer;
1844                         pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
1845                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1846                         pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
1847                         pixel.z = pixel.x;
1848                         pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
1849                         pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
1850                         pixel.y = pixel.z;
1851                         pixel.z = pixel.w = one;
1852                         break;
1853                 case VK_FORMAT_R32G32B32A32_SFLOAT:
1854                 case VK_FORMAT_R32G32B32A32_SINT:
1855                 case VK_FORMAT_R32G32B32A32_UINT:
1856                         buffer = cBuffer;
1857                         pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
1858                         pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
1859                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1860                         pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
1861                         pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
1862                         transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1863                         break;
1864                 default:
1865                         ASSERT(false);
1866                 }
1867
1868                 if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1869                 {
1870                         sRGBtoLinear(pixel.x);
1871                         sRGBtoLinear(pixel.y);
1872                         sRGBtoLinear(pixel.z);
1873                 }
1874
1875                 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1876                 Vector4f sourceFactor;
1877                 Vector4f destFactor;
1878
1879                 blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
1880                 blendFactor(destFactor, oC, pixel, state.destBlendFactor);
1881
1882                 if(state.sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
1883                 {
1884                         oC.x *= sourceFactor.x;
1885                         oC.y *= sourceFactor.y;
1886                         oC.z *= sourceFactor.z;
1887                 }
1888
1889                 if(state.destBlendFactor != VK_BLEND_FACTOR_ONE && state.destBlendFactor != VK_BLEND_FACTOR_ZERO)
1890                 {
1891                         pixel.x *= destFactor.x;
1892                         pixel.y *= destFactor.y;
1893                         pixel.z *= destFactor.z;
1894                 }
1895
1896                 switch(state.blendOperation)
1897                 {
1898                 case VK_BLEND_OP_ADD:
1899                         oC.x += pixel.x;
1900                         oC.y += pixel.y;
1901                         oC.z += pixel.z;
1902                         break;
1903                 case VK_BLEND_OP_SUBTRACT:
1904                         oC.x -= pixel.x;
1905                         oC.y -= pixel.y;
1906                         oC.z -= pixel.z;
1907                         break;
1908                 case VK_BLEND_OP_REVERSE_SUBTRACT:
1909                         oC.x = pixel.x - oC.x;
1910                         oC.y = pixel.y - oC.y;
1911                         oC.z = pixel.z - oC.z;
1912                         break;
1913                 case VK_BLEND_OP_MIN:
1914                         oC.x = Min(oC.x, pixel.x);
1915                         oC.y = Min(oC.y, pixel.y);
1916                         oC.z = Min(oC.z, pixel.z);
1917                         break;
1918                 case VK_BLEND_OP_MAX:
1919                         oC.x = Max(oC.x, pixel.x);
1920                         oC.y = Max(oC.y, pixel.y);
1921                         oC.z = Max(oC.z, pixel.z);
1922                         break;
1923                 case VK_BLEND_OP_SRC_EXT:
1924                         // No operation
1925                         break;
1926                 case VK_BLEND_OP_DST_EXT:
1927                         oC.x = pixel.x;
1928                         oC.y = pixel.y;
1929                         oC.z = pixel.z;
1930                         break;
1931                 case VK_BLEND_OP_ZERO_EXT:
1932                         oC.x = Float4(0.0f);
1933                         oC.y = Float4(0.0f);
1934                         oC.z = Float4(0.0f);
1935                         break;
1936                 default:
1937                         ASSERT(false);
1938                 }
1939
1940                 blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
1941                 blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
1942
1943                 if(state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1944                 {
1945                         oC.w *= sourceFactor.w;
1946                 }
1947
1948                 if(state.destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1949                 {
1950                         pixel.w *= destFactor.w;
1951                 }
1952
1953                 switch(state.blendOperationAlpha)
1954                 {
1955                 case VK_BLEND_OP_ADD:
1956                         oC.w += pixel.w;
1957                         break;
1958                 case VK_BLEND_OP_SUBTRACT:
1959                         oC.w -= pixel.w;
1960                         break;
1961                 case VK_BLEND_OP_REVERSE_SUBTRACT:
1962                         pixel.w -= oC.w;
1963                         oC.w = pixel.w;
1964                         break;
1965                 case VK_BLEND_OP_MIN:
1966                         oC.w = Min(oC.w, pixel.w);
1967                         break;
1968                 case VK_BLEND_OP_MAX:
1969                         oC.w = Max(oC.w, pixel.w);
1970                         break;
1971                 case VK_BLEND_OP_SRC_EXT:
1972                         // No operation
1973                         break;
1974                 case VK_BLEND_OP_DST_EXT:
1975                         oC.w = pixel.w;
1976                         break;
1977                 case VK_BLEND_OP_ZERO_EXT:
1978                         oC.w = Float4(0.0f);
1979                         break;
1980                 default:
1981                         ASSERT(false);
1982                 }
1983         }
1984
1985         void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
1986         {
1987                 switch(state.targetFormat[index])
1988                 {
1989                 case VK_FORMAT_R32_SFLOAT:
1990                 case VK_FORMAT_R32_SINT:
1991                 case VK_FORMAT_R32_UINT:
1992                 case VK_FORMAT_R16_SINT:
1993                 case VK_FORMAT_R16_UINT:
1994                 case VK_FORMAT_R8_SINT:
1995                 case VK_FORMAT_R8_UINT:
1996                         break;
1997                 case VK_FORMAT_R32G32_SFLOAT:
1998                 case VK_FORMAT_R32G32_SINT:
1999                 case VK_FORMAT_R32G32_UINT:
2000                 case VK_FORMAT_R16G16_SINT:
2001                 case VK_FORMAT_R16G16_UINT:
2002                 case VK_FORMAT_R8G8_SINT:
2003                 case VK_FORMAT_R8G8_UINT:
2004                         oC.z = oC.x;
2005                         oC.x = UnpackLow(oC.x, oC.y);
2006                         oC.z = UnpackHigh(oC.z, oC.y);
2007                         oC.y = oC.z;
2008                         break;
2009                 case VK_FORMAT_R32G32B32A32_SFLOAT:
2010                 case VK_FORMAT_R32G32B32A32_SINT:
2011                 case VK_FORMAT_R32G32B32A32_UINT:
2012                 case VK_FORMAT_R16G16B16A16_SINT:
2013                 case VK_FORMAT_R16G16B16A16_UINT:
2014                 case VK_FORMAT_R8G8B8A8_SINT:
2015                 case VK_FORMAT_R8G8B8A8_UINT:
2016                         transpose4x4(oC.x, oC.y, oC.z, oC.w);
2017                         break;
2018                 default:
2019                         ASSERT(false);
2020                 }
2021
2022                 int rgbaWriteMask = state.colorWriteActive(index);
2023
2024                 Int xMask;   // Combination of all masks
2025
2026                 if(state.depthTestActive)
2027                 {
2028                         xMask = zMask;
2029                 }
2030                 else
2031                 {
2032                         xMask = cMask;
2033                 }
2034
2035                 if(state.stencilActive)
2036                 {
2037                         xMask &= sMask;
2038                 }
2039
2040                 Pointer<Byte> buffer;
2041                 Float4 value;
2042
2043                 switch(state.targetFormat[index])
2044                 {
2045                 case VK_FORMAT_R32_SFLOAT:
2046                 case VK_FORMAT_R32_SINT:
2047                 case VK_FORMAT_R32_UINT:
2048                         if(rgbaWriteMask & 0x00000001)
2049                         {
2050                                 buffer = cBuffer + 4 * x;
2051
2052                                 // FIXME: movlps
2053                                 value.x = *Pointer<Float>(buffer + 0);
2054                                 value.y = *Pointer<Float>(buffer + 4);
2055
2056                                 buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2057
2058                                 // FIXME: movhps
2059                                 value.z = *Pointer<Float>(buffer + 0);
2060                                 value.w = *Pointer<Float>(buffer + 4);
2061
2062                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2063                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2064                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2065
2066                                 // FIXME: movhps
2067                                 *Pointer<Float>(buffer + 0) = oC.x.z;
2068                                 *Pointer<Float>(buffer + 4) = oC.x.w;
2069
2070                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2071
2072                                 // FIXME: movlps
2073                                 *Pointer<Float>(buffer + 0) = oC.x.x;
2074                                 *Pointer<Float>(buffer + 4) = oC.x.y;
2075                         }
2076                         break;
2077                 case VK_FORMAT_R16_SINT:
2078                 case VK_FORMAT_R16_UINT:
2079                         if(rgbaWriteMask & 0x00000001)
2080                         {
2081                                 buffer = cBuffer + 2 * x;
2082
2083                                 UShort4 xyzw;
2084                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2085
2086                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2087
2088                                 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2089                                 value = As<Float4>(Int4(xyzw));
2090
2091                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2092                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2093                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2094
2095                                 if(state.targetFormat[index] == VK_FORMAT_R16_SINT)
2096                                 {
2097                                         Float component = oC.x.z;
2098                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2099                                         component = oC.x.w;
2100                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2101
2102                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2103
2104                                         component = oC.x.x;
2105                                         *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2106                                         component = oC.x.y;
2107                                         *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2108                                 }
2109                                 else // VK_FORMAT_R16_UINT
2110                                 {
2111                                         Float component = oC.x.z;
2112                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2113                                         component = oC.x.w;
2114                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2115
2116                                         buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2117
2118                                         component = oC.x.x;
2119                                         *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2120                                         component = oC.x.y;
2121                                         *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2122                                 }
2123                         }
2124                         break;
2125                 case VK_FORMAT_R8_SINT:
2126                 case VK_FORMAT_R8_UINT:
2127                         if(rgbaWriteMask & 0x00000001)
2128                         {
2129                                 buffer = cBuffer + x;
2130
2131                                 UInt xyzw, packedCol;
2132
2133                                 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2134                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2135                                 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2136
2137                                 Short4 tmpCol = Short4(As<Int4>(oC.x));
2138                                 if(state.targetFormat[index] == VK_FORMAT_R8_SINT)
2139                                 {
2140                                         tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2141                                 }
2142                                 else
2143                                 {
2144                                         tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2145                                 }
2146                                 packedCol = Extract(As<Int2>(tmpCol), 0);
2147
2148                                 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2149                                             (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2150
2151                                 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2152                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2153                                 *Pointer<UShort>(buffer) = UShort(packedCol);
2154                         }
2155                         break;
2156                 case VK_FORMAT_R32G32_SFLOAT:
2157                 case VK_FORMAT_R32G32_SINT:
2158                 case VK_FORMAT_R32G32_UINT:
2159                         buffer = cBuffer + 8 * x;
2160
2161                         value = *Pointer<Float4>(buffer);
2162
2163                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2164                         {
2165                                 Float4 masked = value;
2166                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2167                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2168                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2169                         }
2170
2171                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2172                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2173                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2174                         *Pointer<Float4>(buffer) = oC.x;
2175
2176                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2177
2178                         value = *Pointer<Float4>(buffer);
2179
2180                         if((rgbaWriteMask & 0x00000003) != 0x00000003)
2181                         {
2182                                 Float4 masked;
2183
2184                                 masked = value;
2185                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2186                                 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2187                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2188                         }
2189
2190                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2191                         value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2192                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2193                         *Pointer<Float4>(buffer) = oC.y;
2194                         break;
2195                 case VK_FORMAT_R16G16_SINT:
2196                 case VK_FORMAT_R16G16_UINT:
2197                         if((rgbaWriteMask & 0x00000003) != 0x0)
2198                         {
2199                                 buffer = cBuffer + 4 * x;
2200
2201                                 UInt2 rgbaMask;
2202                                 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2203                                 UShort4 value = *Pointer<UShort4>(buffer);
2204                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2205                                 if((rgbaWriteMask & 0x3) != 0x3)
2206                                 {
2207                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2208                                         rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2209                                         mergedMask &= rgbaMask;
2210                                 }
2211                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2212
2213                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2214
2215                                 packedCol = UShort4(As<Int4>(oC.y));
2216                                 value = *Pointer<UShort4>(buffer);
2217                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2218                                 if((rgbaWriteMask & 0x3) != 0x3)
2219                                 {
2220                                         mergedMask &= rgbaMask;
2221                                 }
2222                                 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2223                         }
2224                         break;
2225                 case VK_FORMAT_R8G8_SINT:
2226                 case VK_FORMAT_R8G8_UINT:
2227                         if((rgbaWriteMask & 0x00000003) != 0x0)
2228                         {
2229                                 buffer = cBuffer + 2 * x;
2230
2231                                 Int2 xyzw, packedCol;
2232
2233                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2234                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2235                                 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2236
2237                                 if(state.targetFormat[index] == VK_FORMAT_R8G8_SINT)
2238                                 {
2239                                         packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2240                                 }
2241                                 else
2242                                 {
2243                                         packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2244                                 }
2245
2246                                 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2247                                 if((rgbaWriteMask & 0x3) != 0x3)
2248                                 {
2249                                         Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2250                                         UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2251                                         mergedMask &= rgbaMask;
2252                                 }
2253
2254                                 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2255
2256                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2257                                 buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2258                                 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2259                         }
2260                         break;
2261                 case VK_FORMAT_R32G32B32A32_SFLOAT:
2262                 case VK_FORMAT_R32G32B32A32_SINT:
2263                 case VK_FORMAT_R32G32B32A32_UINT:
2264                         buffer = cBuffer + 16 * x;
2265
2266                         {
2267                                 value = *Pointer<Float4>(buffer, 16);
2268
2269                                 if(rgbaWriteMask != 0x0000000F)
2270                                 {
2271                                         Float4 masked = value;
2272                                         oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2273                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2274                                         oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2275                                 }
2276
2277                                 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2278                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2279                                 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2280                                 *Pointer<Float4>(buffer, 16) = oC.x;
2281                         }
2282
2283                         {
2284                                 value = *Pointer<Float4>(buffer + 16, 16);
2285
2286                                 if(rgbaWriteMask != 0x0000000F)
2287                                 {
2288                                         Float4 masked = value;
2289                                         oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2290                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2291                                         oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2292                                 }
2293
2294                                 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2295                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2296                                 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2297                                 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2298                         }
2299
2300                         buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2301
2302                         {
2303                                 value = *Pointer<Float4>(buffer, 16);
2304
2305                                 if(rgbaWriteMask != 0x0000000F)
2306                                 {
2307                                         Float4 masked = value;
2308                                         oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2309                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2310                                         oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2311                                 }
2312
2313                                 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2314                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2315                                 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2316                                 *Pointer<Float4>(buffer, 16) = oC.z;
2317                         }
2318
2319                         {
2320                                 value = *Pointer<Float4>(buffer + 16, 16);
2321
2322                                 if(rgbaWriteMask != 0x0000000F)
2323                                 {
2324                                         Float4 masked = value;
2325                                         oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2326                                         masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2327                                         oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2328                                 }
2329
2330                                 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2331                                 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2332                                 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2333                                 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2334                         }
2335                         break;
2336                 case VK_FORMAT_R16G16B16A16_SINT:
2337                 case VK_FORMAT_R16G16B16A16_UINT:
2338                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2339                         {
2340                                 buffer = cBuffer + 8 * x;
2341
2342                                 UInt4 rgbaMask;
2343                                 UShort8 value = *Pointer<UShort8>(buffer);
2344                                 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2345                                 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2346                                 if((rgbaWriteMask & 0xF) != 0xF)
2347                                 {
2348                                         UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2349                                         rgbaMask = UInt4(tmpMask, tmpMask);
2350                                         mergedMask &= rgbaMask;
2351                                 }
2352                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2353
2354                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2355
2356                                 value = *Pointer<UShort8>(buffer);
2357                                 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2358                                 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2359                                 if((rgbaWriteMask & 0xF) != 0xF)
2360                                 {
2361                                         mergedMask &= rgbaMask;
2362                                 }
2363                                 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2364                         }
2365                         break;
2366                 case VK_FORMAT_R8G8B8A8_SINT:
2367                 case VK_FORMAT_R8G8B8A8_UINT:
2368                         if((rgbaWriteMask & 0x0000000F) != 0x0)
2369                         {
2370                                 UInt2 value, packedCol, mergedMask;
2371
2372                                 buffer = cBuffer + 4 * x;
2373
2374                                 if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT)
2375                                 {
2376                                         packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2377                                 }
2378                                 else
2379                                 {
2380                                         packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2381                                 }
2382                                 value = *Pointer<UInt2>(buffer, 16);
2383                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2384                                 if(rgbaWriteMask != 0xF)
2385                                 {
2386                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2387                                 }
2388                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2389
2390                                 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2391
2392                                 if(state.targetFormat[index] == VK_FORMAT_R8G8B8A8_SINT)
2393                                 {
2394                                         packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2395                                 }
2396                                 else
2397                                 {
2398                                         packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2399                                 }
2400                                 value = *Pointer<UInt2>(buffer, 16);
2401                                 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2402                                 if(rgbaWriteMask != 0xF)
2403                                 {
2404                                         mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2405                                 }
2406                                 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2407                         }
2408                         break;
2409                 default:
2410                         ASSERT(false);
2411                 }
2412         }
2413
2414         UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2415         {
2416                 return UShort4(cf * Float4(0xFFFF), saturate);
2417         }
2418
2419         void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2420         {
2421                 Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2422
2423                 c.x = As<UShort4>(c.x) >> 4;
2424                 c.y = As<UShort4>(c.y) >> 4;
2425                 c.z = As<UShort4>(c.z) >> 4;
2426
2427                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2428                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2429                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2430                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2431
2432                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2433                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2434                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2435                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2436
2437                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2438                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2439                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2440                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2441         }
2442
2443         void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2444         {
2445                 c.x = As<UShort4>(c.x) >> 4;
2446                 c.y = As<UShort4>(c.y) >> 4;
2447                 c.z = As<UShort4>(c.z) >> 4;
2448
2449                 linearToSRGB12_16(c);
2450         }
2451
2452         void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2453         {
2454                 Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2455
2456                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2457                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2458                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2459                 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2460
2461                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2462                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2463                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2464                 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2465
2466                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2467                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2468                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2469                 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2470         }
2471
2472         Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2473         {
2474                 Float4 linear = x * x;
2475                 linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2476
2477                 return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2478         }
2479
2480         bool PixelRoutine::colorUsed()
2481         {
2482                 return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2483         }
2484 }