android-x86/external-mesa.git: src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file fetch_jit.cpp
24  *
25  * @brief Implementation of the fetch jitter
26  *
27  * Notes:
28  *
29  ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45     CONVERT_NONE,
46     CONVERT_NORMALIZED,
47     CONVERT_USCALED,
48     CONVERT_SSCALED,
49     CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57     FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}
58
59     Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64     template <typename T>
65     Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67     // package up Shuffle*bpcGatherd args into a tuple for convenience
68     typedef std::tuple<Value*&,
69                        Value*,
70                        const Instruction::CastOps,
71                        const ConversionType,
72                        uint32_t&,
73                        uint32_t&,
74                        const ComponentEnable,
75                        const ComponentControl (&)[4],
76                        Value* (&)[4],
77                        const uint32_t (&)[4]>
78         Shuffle8bpcArgs;
79
80     void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81     void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83     typedef std::tuple<Value* (&)[2],
84                        Value*,
85                        const Instruction::CastOps,
86                        const ConversionType,
87                        uint32_t&,
88                        uint32_t&,
89                        const ComponentEnable,
90                        const ComponentControl (&)[4],
91                        Value* (&)[4]>
92         Shuffle16bpcArgs;
93
94     void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95     void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97     void StoreVertexElements(Value*         pVtxOut,
98                              const uint32_t outputElt,
99                              const uint32_t numEltsToStore,
100                              Value* (&vVertexElements)[4]);
101
102     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104     void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105                            Value*                     streams,
106                            Value*                     vIndices,
107                            Value*                     pVtxOut);
108
109     bool IsOddFormat(SWR_FORMAT format);
110     bool IsUniformFormat(SWR_FORMAT format);
111     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112     void CreateGatherOddFormats(
113         SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114     void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116     Value* mpWorkerData;
117     Value* mpFetchInfo;
118 };
119
120 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
121 {
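    // Name the function "FCH_" + CRC of the fetch compile state so each unique state gets a
    // distinct, reproducible symbol (the name is also used as the module identifier below)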
122     std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
123     fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
124
125     Function* fetch = Function::Create(
126         JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
127     BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
128
129     fetch->getParent()->setModuleIdentifier(fetch->getName());
130
131     IRB()->SetInsertPoint(entry);
132
133     auto argitr = fetch->arg_begin();
134
135     // Fetch shader arguments
136     Value* privateContext = &*argitr;
137     ++argitr;
138     privateContext->setName("privateContext");
139     SetPrivateContext(privateContext);
140
141     mpWorkerData = &*argitr;
142     ++argitr;
143     mpWorkerData->setName("pWorkerData");
144     mpFetchInfo = &*argitr;
145     ++argitr;
146     mpFetchInfo->setName("fetchInfo");
147     Value* pVtxOut = &*argitr;
148     pVtxOut->setName("vtxOutput");
149
150     uint32_t baseWidth = mVWidth;
151
152     SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154     // Override builder target width to force 16-wide SIMD
155 #if USE_SIMD16_SHADERS
156     SetTargetWidth(16);
157 #endif
158
159     pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161     // SWR_FETCH_CONTEXT::pStreams
162     Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163     streams->setName("pStreams");
164
165     // SWR_FETCH_CONTEXT::pIndices
166     Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167     indices->setName("pIndices");
168
169     // SWR_FETCH_CONTEXT::pLastIndex
170     Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171     pLastIndex->setName("pLastIndex");
172
173     Value* vIndices;
174     switch (fetchState.indexType)
175     {
176     case R8_UINT:
177         indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178         if (fetchState.bDisableIndexOOBCheck)
179         {
180             vIndices = LOAD(
181                 BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
182                 {(uint32_t)0});
183             vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184         }
185         else
186         {
187             vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188         }
189         break;
190     case R16_UINT:
191         if (fetchState.bDisableIndexOOBCheck)
192         {
193             vIndices = LOAD(
194                 BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
195                 {(uint32_t)0});
196             vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197         }
198         else
199         {
200             vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201         }
202         break;
203     case R32_UINT:
204         vIndices = fetchState.bDisableIndexOOBCheck
205                        ? LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
206                        : GetSimdValid32bitIndices(indices, pLastIndex);
207         break; // incoming type is already 32bit int
208     default:
209         SWR_INVALID("Unsupported index type");
210         vIndices = nullptr;
211         break;
212     }
213
214     if (fetchState.bForceSequentialAccessEnable)
215     {
216         Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
217                                        : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
218
219         // VertexData buffers are accessed sequentially; the index is equal to the vertex number
220         vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
221         vIndices = ADD(vIndices, pOffsets);
222     }
223
224     Value* vVertexId = vIndices;
225     if (fetchState.bVertexIDOffsetEnable)
226     {
227         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
228         // correct
229         Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
230         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
231         vVertexId           = ADD(vIndices, vBaseVertex);
232         vVertexId           = ADD(vVertexId, vStartVertex);
233     }
234
235     // store out vertex IDs
236     if (mVWidth == 16)
237     {
238         // store out in simd8 halves until core supports 16-wide natively
239         auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
240         auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
241         STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
242         STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
243     }
244     else if (mVWidth == 8)
245     {
246         STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
247     }
248
249     // store out cut mask if enabled
250     if (fetchState.bEnableCutIndex)
251     {
252         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
253         Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));
254
255         if (mVWidth == 16)
256         {
257             auto cutMaskLo = EXTRACT_16(cutMask, 0);
258             auto cutMaskHi = EXTRACT_16(cutMask, 1);
259             STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
260             STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
261         }
262         else if (mVWidth == 8)
263         {
264             STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
265         }
266     }
267
268     // Fetch attributes from memory and output to a simdvertex struct
269     JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
270
271     RET_VOID();
272
273     JitManager::DumpToFile(fetch, "src");
274
275 #if defined(_DEBUG)
276     verifyFunction(*fetch);
277 #endif
278
279     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
280
281     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
282     setupPasses.add(createBreakCriticalEdgesPass());
283     setupPasses.add(createCFGSimplificationPass());
284     setupPasses.add(createEarlyCSEPass());
285     setupPasses.add(createPromoteMemoryToRegisterPass());
286
287     setupPasses.run(*fetch);
288
289     JitManager::DumpToFile(fetch, "se");
290
291     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
292
293     ///@todo Haven't touched these either. Need to remove some of these and add others.
294     optPasses.add(createCFGSimplificationPass());
295     optPasses.add(createEarlyCSEPass());
296     optPasses.add(createInstructionCombiningPass());
297     optPasses.add(createInstructionSimplifierPass());
298     optPasses.add(createConstantPropagationPass());
299     optPasses.add(createSCCPPass());
300     optPasses.add(createAggressiveDCEPass());
301
302     optPasses.run(*fetch);
303
304     optPasses.add(createLowerX86Pass(this));
305     optPasses.run(*fetch);
306
307     JitManager::DumpToFile(fetch, "opt");
308
309
310     // Revert 16-wide override
311 #if USE_SIMD16_SHADERS
312     SetTargetWidth(baseWidth);
313 #endif
314
315     return fetch;
316 }
317
318 // returns true for odd formats that require special gather handling
319 bool FetchJit::IsOddFormat(SWR_FORMAT format)
320 {
321     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
322     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
323     {
324         return true;
325     }
326     return false;
327 }
328
329 // format is uniform if all components are the same size and type
330 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
331 {
332     const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
333     uint32_t               bpc0  = info.bpc[0];
334     uint32_t               type0 = info.type[0];
335
336     for (uint32_t c = 1; c < info.numComps; ++c)
337     {
338         if (bpc0 != info.bpc[c] || type0 != info.type[c])
339         {
340             return false;
341         }
342     }
343     return true;
344 }
345
346 // unpacks components based on format
347 // foreach component in the pixel
348 //   mask off everything but this component
349 //   shift component to LSB
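// e.g. for a 10/10/10/2-bit layout the loop below masks with 0x3FF << 0, 0x3FF << 10,
// 0x3FF << 20 and 0x3 << 30, shifts each field down to bit 0, and writes it to result[swizzle[c]]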
350 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
351 {
352     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
353
354     uint32_t bitOffset = 0;
355     for (uint32_t c = 0; c < info.numComps; ++c)
356     {
357         uint32_t swizzledIndex = info.swizzle[c];
358         uint32_t compBits      = info.bpc[c];
359         uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
360         Value*   comp          = AND(vInput, bitmask);
361         comp                   = LSHR(comp, bitOffset);
362
363         result[swizzledIndex] = comp;
364         bitOffset += compBits;
365     }
366 }
367
368 // gather for odd component size formats
369 // gather a full pixel per SIMD lane, then shift/mask to move each component into its
370 // own vector
371 void FetchJit::CreateGatherOddFormats(
372     SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
373 {
374     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
375
376     // only works if pixel size is <= 32bits
377     SWR_ASSERT(info.bpp <= 32);
378
379     Value* pGather;
380     if (info.bpp == 32)
381     {
382         pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
383     }
384     else
385     {
386         // Can't use a 32-bit gather for items less than 32 bits; it could cause page faults.
387         Value* pMem = ALLOCA(mSimdInt32Ty);
388         STORE(VIMMED1(0u), pMem);
389
390         pBase          = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
391         Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
392
393         for (uint32_t lane = 0; lane < mVWidth; ++lane)
394         {
395             // Get index
396             Value* index = VEXTRACT(pOffsets, C(lane));
397             Value* mask  = VEXTRACT(pMask, C(lane));
398             switch (info.bpp)
399             {
400             case 8:
401             {
402                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
403                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
404                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
405                 break;
406             }
407
408             case 16:
409             {
410                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
411                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
412                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
413                 break;
414             }
415
416
417             case 24:
418             {
419                 // First 16-bits of data
420                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
421                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
422                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
423
424                 // Last 8-bits of data
425                 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
426                 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
427                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
428                 break;
429             }
430
431             default:
432                 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
433                 break;
434             }
435         }
436
437         pGather = LOAD(pMem);
438     }
439
440     for (uint32_t comp = 0; comp < 4; ++comp)
441     {
442         pResult[comp] = VIMMED1((int)info.defaults[comp]);
443     }
444
445     UnpackComponents(format, pGather, pResult);
446
447     // cast to fp32
448     pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
449     pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
450     pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
451     pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
452 }
453
454 void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
455 {
456     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
457
458     for (uint32_t c = 0; c < info.numComps; ++c)
459     {
460         uint32_t compIndex = info.swizzle[c];
461
462         // skip any conversion on UNUSED components
463         if (info.type[c] == SWR_TYPE_UNUSED)
464         {
465             continue;
466         }
467
468         if (info.isNormalized[c])
469         {
470             if (info.type[c] == SWR_TYPE_SNORM)
471             {
472                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
473                 /// -1.0f.
474
475                 /// result = c * (1.0f / (2^(n-1) - 1))
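                /// e.g. for an 8-bit SNORM component the scale is 1.0f / 127.0f, so 127 maps to
                /// 1.0f and -127 maps to -1.0f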
476                 uint32_t n        = info.bpc[c];
477                 uint32_t pow2     = 1 << (n - 1);
478                 float    scale    = 1.0f / (float)(pow2 - 1);
479                 Value*   vScale   = VIMMED1(scale);
480                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
481                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
482                 texels[compIndex] = FMUL(texels[compIndex], vScale);
483             }
484             else
485             {
486                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
487
488                 /// result = c * (1.0f / (2^n - 1))
489                 uint32_t n    = info.bpc[c];
490                 uint32_t pow2 = 1 << n;
491                 // special case 24bit unorm format, which requires a full divide to meet ULP
492                 // requirement
493                 if (n == 24)
494                 {
495                     float  scale      = (float)(pow2 - 1);
496                     Value* vScale     = VIMMED1(scale);
497                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
498                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
499                     texels[compIndex] = FDIV(texels[compIndex], vScale);
500                 }
501                 else
502                 {
503                     float  scale      = 1.0f / (float)(pow2 - 1);
504                     Value* vScale     = VIMMED1(scale);
505                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
506                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
507                     texels[compIndex] = FMUL(texels[compIndex], vScale);
508                 }
509             }
510             continue;
511         }
512     }
513 }
514
515 //////////////////////////////////////////////////////////////////////////
516 /// @brief Loads attributes from memory using AVX2 GATHER(s)
517 /// @param fetchState - info about attributes to be fetched from memory
518 /// @param streams - value pointer to the current vertex stream
519 /// @param vIndices - vector value of indices to gather
520 /// @param pVtxOut - value pointer to output simdvertex struct
521 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
522                                  Value*                     streams,
523                                  Value*                     vIndices,
524                                  Value*                     pVtxOut)
525 {
526     uint32_t currentVertexElement = 0;
527     uint32_t outputElt            = 0;
528     Value*   vVertexElements[4];
529
530     Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
531     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
532     Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
533     Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
534     curInstance->setName("curInstance");
535
536     for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
537     {
538         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
539
540         // skip element if all components are disabled
541         if (ied.ComponentPacking == ComponentEnable::NONE)
542         {
543             continue;
544         }
545
546         const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
547         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
548         uint32_t bpc =
549             info.bpp /
550             info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
551
552         Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
553
554         // VGATHER* takes an *i8 src pointer
555         Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
556
557         Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
558         Value* vStride = VBROADCAST(stride);
559
560         // max vertex index that is fully in bounds
561         Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
562         maxVertex        = LOAD(maxVertex);
563
564         Value* minVertex = NULL;
565         if (fetchState.bPartialVertexBuffer)
566         {
567             // min vertex index for low bounds OOB checking
568             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
569             minVertex = LOAD(minVertex);
570         }
571
572         if (fetchState.bInstanceIDOffsetEnable)
573         {
574             // the InstanceID (curInstance) value is offset by StartInstanceLocation
575             curInstance = ADD(curInstance, startInstance);
576         }
577
578         Value* vCurIndices;
579         Value* startOffset;
580         Value* vInstanceStride = VIMMED1(0);
581
582         if (ied.InstanceEnable)
583         {
584             Value* stepRate = C(ied.InstanceAdvancementState);
585
586             // prevent a div by 0 for 0 step rate
587             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
588             stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
589
590             // calc the current offset into instanced data buffer
591             Value* calcInstance = UDIV(curInstance, stepRate);
592
593             // if step rate is 0, every instance gets instance 0
594             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
595
596             vCurIndices = VBROADCAST(calcInstance);
597             startOffset = startInstance;
598         }
599         else if (ied.InstanceStrideEnable)
600         {
601             // grab the instance advancement state, which determines the stride in bytes from one
602             // instance to the next
603             Value* stepRate = C(ied.InstanceAdvancementState);
604             vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
605
606             // offset indices by baseVertex
607             vCurIndices = ADD(vIndices, vBaseVertex);
608
609             startOffset = startVertex;
610             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
611         }
612         else
613         {
614             // offset indices by baseVertex
615             vCurIndices = ADD(vIndices, vBaseVertex);
616             startOffset = startVertex;
617         }
618
619         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
620         // do 64bit address offset calculations.
621
622         // calculate byte offset to the start of the VB
623         Value* baseOffset     = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
624         pStreamBase           = GEP(pStreamBase, baseOffset);
625         Value* pStreamBaseGFX = ADD(stream, baseOffset);
626
627         // if we have a start offset, subtract from max vertex. Used for OOB check
628         maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
629         Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
630         // if we have a negative value, we're already OOB. clamp at 0.
631         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
632
633         if (fetchState.bPartialVertexBuffer)
634         {
635             // similarly for min vertex
636             minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
637             Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
638             minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
639         }
640
641         // Load the in bounds size of a partially valid vertex
642         Value* partialInboundsSize =
643             GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
644         partialInboundsSize       = LOAD(partialInboundsSize);
645         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
646         Value* vBpp               = VBROADCAST(C(info.Bpp));
647         Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
648
649         // is the element <= the partially valid size
650         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
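        // lanes that index the last, partially valid vertex may still fetch this element if the
        // element ends at or before partialInboundsSize; that mask is blended in below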
651
652         // override cur indices with 0 if pitch is 0
653         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
654         vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
655
656         // are vertices partially OOB?
657         Value* vMaxVertex      = VBROADCAST(maxVertex);
658         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
659
660         // are vertices fully in bounds?
661         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
662
663         Value* vGatherMask;
664         if (fetchState.bPartialVertexBuffer)
665         {
666             // are vertices below minVertex limit?
667             Value* vMinVertex     = VBROADCAST(minVertex);
668             Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
669
670             // only fetch lanes that pass both tests
671             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
672         }
673         else
674         {
675             vGatherMask = vMaxGatherMask;
676         }
677
678         // blend in any partially OOB indices that have valid elements
679         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
680
681         // calculate the actual offsets into the VB
682         Value* vOffsets = MUL(vCurIndices, vStride);
683         vOffsets        = ADD(vOffsets, vAlignmentOffsets);
684
685         // if instance stride enable is:
686         //  true  - add product of the instanceID and advancement state to the offset into the VB
687         //  false - value of vInstanceStride has been initialized to zero
688         vOffsets = ADD(vOffsets, vInstanceStride);
689
690         // Packing and component control
691         ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
692         const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
693                                            (ComponentControl)ied.ComponentControl1,
694                                            (ComponentControl)ied.ComponentControl2,
695                                            (ComponentControl)ied.ComponentControl3};
696
697         // Special gather/conversion for formats without equal component sizes
698         if (IsOddFormat((SWR_FORMAT)ied.Format))
699         {
700             Value* pResults[4];
701             CreateGatherOddFormats(
702                 (SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
703             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
704
705             for (uint32_t c = 0; c < 4; c += 1)
706             {
707                 if (isComponentEnabled(compMask, c))
708                 {
709                     vVertexElements[currentVertexElement++] = pResults[c];
710                     if (currentVertexElement > 3)
711                     {
712                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
713                         // reset to the next vVertexElement to output
714                         currentVertexElement = 0;
715                     }
716                 }
717             }
718         }
719         else if (info.type[0] == SWR_TYPE_FLOAT)
720         {
721             ///@todo: support 64 bit vb accesses
722             Value* gatherSrc = VIMMED1(0.0f);
723
724             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
725                        "Unsupported format for standard gather fetch.");
726
727             // Gather components from memory to store in a simdvertex structure
728             switch (bpc)
729             {
730             case 16:
731             {
732                 Value* vGatherResult[2];
733
734                 // if we have at least one component out of x or y to fetch
735                 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
736                 {
737                     vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
738                     // e.g. result of first 8x32bit integer gather for 16bit components
739                     // 256i - 0    1    2    3    4    5    6    7
740                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
741                     //
742                 }
743
744                 // if we have at least one component out of z or w to fetch
745                 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
746                 {
747                     // offset base to the next components(zw) in the vertex to gather
748                     pStreamBase = GEP(pStreamBase, C((char)4));
749
750                     vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
751                     // e.g. result of second 8x32bit integer gather for 16bit components
752                     // 256i - 0    1    2    3    4    5    6    7
753                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
754                     //
755                 }
756
757                 // if we have at least one component to shuffle into place
758                 if (compMask)
759                 {
760                     Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
761                                                                   pVtxOut,
762                                                                   Instruction::CastOps::FPExt,
763                                                                   CONVERT_NONE,
764                                                                   currentVertexElement,
765                                                                   outputElt,
766                                                                   compMask,
767                                                                   compCtrl,
768                                                                   vVertexElements);
769
770                     // Shuffle gathered components into place in simdvertex struct
771                     mVWidth == 16 ? Shuffle16bpcGather16(args)
772                                   : Shuffle16bpcGather(args); // outputs to vVertexElements ref
773                 }
774             }
775             break;
776             case 32:
777             {
778                 for (uint32_t i = 0; i < 4; i += 1)
779                 {
780                     if (isComponentEnabled(compMask, i))
781                     {
782                         // if we need to gather the component
783                         if (compCtrl[i] == StoreSrc)
784                         {
785                             // Gather a SIMD of vertices
786                             // APIs allow a 4GB range for offsets
787                             // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
788                             // But, we know that elements must be aligned for FETCH. :)
789                             // Right shift the offset by a bit and then scale by 2 to remove the
790                             // sign extension.
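                            // e.g. a byte offset of 3,000,000,000 (0xB2D05E00) would read as negative
                            // in a signed i32; 0xB2D05E00 >> 1 = 0x59682F00 stays positive, and the
                            // gather scale of 2 restores the original offset (low bit is 0 by alignment)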
791                             Value* vShiftedOffsets = LSHR(vOffsets, 1);
792                             vVertexElements[currentVertexElement++] =
793                                 GATHERPS(gatherSrc,
794                                          pStreamBaseGFX,
795                                          vShiftedOffsets,
796                                          vGatherMask,
797                                          2,
798                                          GFX_MEM_CLIENT_FETCH);
799                         }
800                         else
801                         {
802                             vVertexElements[currentVertexElement++] =
803                                 GenerateCompCtrlVector(compCtrl[i]);
804                         }
805
806                         if (currentVertexElement > 3)
807                         {
808                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
809                             // reset to the next vVertexElement to output
810                             currentVertexElement = 0;
811                         }
812                     }
813
814                     // offset base to the next component in the vertex to gather
815                     pStreamBase    = GEP(pStreamBase, C((char)4));
816                     pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
817                 }
818             }
819             break;
820             case 64:
821             {
822                 for (uint32_t i = 0; i < 4; i += 1)
823                 {
824                     if (isComponentEnabled(compMask, i))
825                     {
826                         // if we need to gather the component
827                         if (compCtrl[i] == StoreSrc)
828                         {
829                             Value* vShufLo;
830                             Value* vShufHi;
831                             Value* vShufAll;
832
833                             if (mVWidth == 8)
834                             {
835                                 vShufLo  = C({0, 1, 2, 3});
836                                 vShufHi  = C({4, 5, 6, 7});
837                                 vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
838                             }
839                             else
840                             {
841                                 SWR_ASSERT(mVWidth == 16);
842                                 vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
843                                 vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
844                                 vShufAll =
845                                     C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
846                             }
847
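                            // each GATHERPD returns mVWidth/2 doubles, so split the mask and offsets
                            // into low/high halves, gather each half, convert to float, and re-join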
848                             Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
849                             Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
850
851                             Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
852                             Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
853
854                             Value* vZeroDouble = VECTOR_SPLAT(
855                                 mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
856
857                             Value* pGatherLo =
858                                 GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
859                             Value* pGatherHi =
860                                 GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
861
862                             pGatherLo = VCVTPD2PS(pGatherLo);
863                             pGatherHi = VCVTPD2PS(pGatherHi);
864
865                             Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
866
867                             vVertexElements[currentVertexElement++] = pGather;
868                         }
869                         else
870                         {
871                             vVertexElements[currentVertexElement++] =
872                                 GenerateCompCtrlVector(compCtrl[i]);
873                         }
874
875                         if (currentVertexElement > 3)
876                         {
877                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
878                             // reset to the next vVertexElement to output
879                             currentVertexElement = 0;
880                         }
881                     }
882
883                     // offset base to the next component  in the vertex to gather
884                     pStreamBase = GEP(pStreamBase, C((char)8));
885                 }
886             }
887             break;
888             default:
889                 SWR_INVALID("Tried to fetch invalid FP format");
890                 break;
891             }
892         }
893         else
894         {
895             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
896             ConversionType       conversionType = CONVERT_NONE;
897
898             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
899                        "Unsupported format for standard gather fetch.");
900
901             switch (info.type[0])
902             {
903             case SWR_TYPE_UNORM:
904                 conversionType = CONVERT_NORMALIZED;
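                // deliberate fall-through: UNORM is zero-extended like UINT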
905             case SWR_TYPE_UINT:
906                 extendCastType = Instruction::CastOps::ZExt;
907                 break;
908             case SWR_TYPE_SNORM:
909                 conversionType = CONVERT_NORMALIZED;
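                // deliberate fall-through: SNORM is sign-extended like SINT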
910             case SWR_TYPE_SINT:
911                 extendCastType = Instruction::CastOps::SExt;
912                 break;
913             case SWR_TYPE_USCALED:
914                 conversionType = CONVERT_USCALED;
915                 extendCastType = Instruction::CastOps::UIToFP;
916                 break;
917             case SWR_TYPE_SSCALED:
918                 conversionType = CONVERT_SSCALED;
919                 extendCastType = Instruction::CastOps::SIToFP;
920                 break;
921             case SWR_TYPE_SFIXED:
922                 conversionType = CONVERT_SFIXED;
923                 extendCastType = Instruction::CastOps::SExt;
924                 break;
925             default:
926                 break;
927             }
928
929             // value substituted when component of gather is masked
930             Value* gatherSrc = VIMMED1(0);
931
932             // Gather components from memory to store in a simdvertex structure
933             switch (bpc)
934             {
935             case 8:
936             {
937                 // if we have at least one component to fetch
938                 if (compMask)
939                 {
940                     Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
941                     // e.g. result of an 8x32bit integer gather for 8bit components
942                     // 256i - 0    1    2    3    4    5    6    7
943                     //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
944
945                     Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
946                                                                  pVtxOut,
947                                                                  extendCastType,
948                                                                  conversionType,
949                                                                  currentVertexElement,
950                                                                  outputElt,
951                                                                  compMask,
952                                                                  compCtrl,
953                                                                  vVertexElements,
954                                                                  info.swizzle);
955
956                     // Shuffle gathered components into place in simdvertex struct
957                     mVWidth == 16 ? Shuffle8bpcGatherd16(args)
958                                   : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
959                 }
960             }
961             break;
962             case 16:
963             {
964                 Value* vGatherResult[2];
965
966                 // if we have at least one component out of x or y to fetch
967                 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
968                 {
969                     vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
970                     // e.g. result of first 8x32bit integer gather for 16bit components
971                     // 256i - 0    1    2    3    4    5    6    7
972                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
973                     //
974                 }
975
976                 // if we have at least one component out of z or w to fetch
977                 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
978                 {
979                     // offset base to the next components(zw) in the vertex to gather
980                     pStreamBase = GEP(pStreamBase, C((char)4));
981
982                     vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
983                     // e.g. result of second 8x32bit integer gather for 16bit components
984                     // 256i - 0    1    2    3    4    5    6    7
985                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
986                     //
987                 }
988
989                 // if we have at least one component to shuffle into place
990                 if (compMask)
991                 {
992                     Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
993                                                                   pVtxOut,
994                                                                   extendCastType,
995                                                                   conversionType,
996                                                                   currentVertexElement,
997                                                                   outputElt,
998                                                                   compMask,
999                                                                   compCtrl,
1000                                                                   vVertexElements);
1001
1002                     // Shuffle gathered components into place in simdvertex struct
1003                     mVWidth == 16 ? Shuffle16bpcGather16(args)
1004                                   : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1005                 }
1006             }
1007             break;
1008             case 32:
1009             {
1010                 // Gather components into place in the simdvertex struct
1011                 for (uint32_t i = 0; i < 4; i++)
1012                 {
1013                     if (isComponentEnabled(compMask, i))
1014                     {
1015                         // if we need to gather the component
1016                         if (compCtrl[i] == StoreSrc)
1017                         {
1018                             Value* pGather =
1019                                 GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1020
1021                             if (conversionType == CONVERT_USCALED)
1022                             {
1023                                 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1024                             }
1025                             else if (conversionType == CONVERT_SSCALED)
1026                             {
1027                                 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1028                             }
1029                             else if (conversionType == CONVERT_SFIXED)
1030                             {
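                                // SFIXED is 16.16 fixed point: convert to float, then scale by 1/2^16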
1031                                 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1032                                                VBROADCAST(C(1 / 65536.0f)));
1033                             }
1034
1035                             vVertexElements[currentVertexElement++] = pGather;
1036
1037                             // e.g. result of a single 8x32bit integer gather for 32bit components
1038                             // 256i - 0    1    2    3    4    5    6    7
1039                             //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1040                         }
1041                         else
1042                         {
1043                             vVertexElements[currentVertexElement++] =
1044                                 GenerateCompCtrlVector(compCtrl[i]);
1045                         }
1046
1047                         if (currentVertexElement > 3)
1048                         {
1049                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1050
1051                             // reset to the next vVertexElement to output
1052                             currentVertexElement = 0;
1053                         }
1054                     }
1055
1056                     // offset base to the next component  in the vertex to gather
1057                     pStreamBase = GEP(pStreamBase, C((char)4));
1058                 }
1059             }
1060             break;
1061             }
1062         }
1063     }
1064
1065     // if we have a partially filled vVertexElement struct, output it
1066     if (currentVertexElement > 0)
1067     {
1068         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1069     }
1070 }
1071
1072 template <typename T>
1073 Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1074 {
1075     SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1076                "Function expects gfxptr_t for both input parameters.");
1077
1078     Type* Ty = nullptr;
1079
1080     static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1081                   "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1082     constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1083     if (bSize)
1084     {
1085         Ty = mInt16PtrTy;
1086     }
1087     else if (sizeof(T) == sizeof(uint8_t))
1088     {
1089         Ty = mInt8PtrTy;
1090     }
1091     else
1092     {
1093         SWR_ASSERT(false, "This should never happen as per static_assert above.");
1094     }
1095
1096     Value* vIndices = VUNDEF_I();
1097
1098     {
1099         // store 0 index on stack to be used to conditionally load from if index address is OOB
1100         Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1101         STORE(C((T)0), pZeroIndex);
1102
1103         // Load a SIMD of index pointers
1104         for (int64_t lane = 0; lane < mVWidth; lane++)
1105         {
1106             // Calculate the address of the requested index
1107             Value* pIndex = GEP(pIndices, C(lane), Ty);
1108
1109             pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1110
1111             // check if the address is less than the max index,
1112             Value* mask = ICMP_ULT(pIndex, pLastIndex);
1113
1114             // if valid, load the index. if not, load 0 from the stack
1115             Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1116             Value* index  = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
1117
1118             // zero extended index to 32 bits and insert into the correct simd lane
1119             index    = Z_EXT(index, mInt32Ty);
1120             vIndices = VINSERT(vIndices, index, lane);
1121         }
1122     }
1123
1124     return vIndices;
1125 }
1126
1127 //////////////////////////////////////////////////////////////////////////
1128 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1129 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1130 /// support
1131 /// @param pIndices - pointer to 8 bit indices
1132 /// @param pLastIndex - pointer to last valid index
1133 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1134 {
1135     return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1136 }
1137
1138 //////////////////////////////////////////////////////////////////////////
1139 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1140 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1141 /// support
1142 /// @param pIndices - pointer to 16 bit indices
1143 /// @param pLastIndex - pointer to last valid index
1144 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1145 {
1146     return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1147 }
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1151 /// @param pIndices - pointer to 32 bit indices
1152 /// @param pLastIndex - pointer to last valid index
1153 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1154 {
1155     DataLayout dL(JM()->mpCurrentModule);
1156     Value*     iLastIndex = pLastIndex;
1157     Value*     iIndices   = pIndices;
1158
1159     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1160     Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1161     numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
1162     numIndicesLeft        = SDIV(numIndicesLeft, C(4));
1163
1164     // create a vector of index counts from the base index ptr passed into the fetch
1165     Constant* vIndexOffsets;
1166     if (mVWidth == 8)
1167     {
1168         vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1169     }
1170     else
1171     {
1172         vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1173     }
1174
1175     // compare index count to the max valid index
1176     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1177     //     vIndexOffsets  0 1 2 3 4 5 6 7
1178     //     ------------------------------
1179     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1180     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1181     Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
1182     Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1183
1184     // Load the indices; OOB loads 0
1185     return MASKED_LOAD(pIndices,
1186                        4,
1187                        vIndexMask,
1188                        VIMMED1(0),
1189                        "vIndices",
1190                        PointerType::get(mSimdInt32Ty, 0),
1191                        GFX_MEM_CLIENT_FETCH);
1192 }
1193
1194 //////////////////////////////////////////////////////////////////////////
1195 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1196 /// denormalizes if needed, converts to F32 if needed, and positions in
1197 /// the proper SIMD rows to be output to the simdvertex structure
1198 /// @param args: (tuple of args, listed below)
1199 ///   @param vGatherResult - 8 gathered 8bpc vertices
1200 ///   @param pVtxOut - base pointer to output simdvertex struct
1201 ///   @param extendType - sign extend or zero extend
1202 ///   @param conversionType - conversion to apply (none, normalized, scaled, or sfixed)
1203 ///   @param currentVertexElement - reference to the current vVertexElement
1204 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
1205 ///   @param compMask - component packing mask
1206 ///   @param compCtrl - component control val
1207 ///   @param vVertexElements[4] - vertex components to output
1208 ///   @param swizzle[4] - component swizzle location
1209 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1210 {
1211     // Unpack tuple args
1212     Value*&                    vGatherResult        = std::get<0>(args);
1213     Value*                     pVtxOut              = std::get<1>(args);
1214     const Instruction::CastOps extendType           = std::get<2>(args);
1215     const ConversionType       conversionType       = std::get<3>(args);
1216     uint32_t&                  currentVertexElement = std::get<4>(args);
1217     uint32_t&                  outputElt            = std::get<5>(args);
1218     const ComponentEnable      compMask             = std::get<6>(args);
1219     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1220     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1221     const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1222
1223     // cast types
1224     Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1225     Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
1226
1227     // have to do extra work for sign extending
1228     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1229     {
1230         Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1231         Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1232
1233         // shuffle mask, including any swizzling
1234         const char x          = (char)swizzle[0];
1235         const char y          = (char)swizzle[1];
1236         const char z          = (char)swizzle[2];
1237         const char w          = (char)swizzle[3];
1238         Value*     vConstMask = C<char>(
1239             {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
1240              char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
1241              char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
1242              char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
1243              char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
1244              char(w + 8), char(w + 12)});
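        // for the identity swizzle (x,y,z,w = 0,1,2,3) this mask is {0,4,8,12, 1,5,9,13, 2,6,10,14,
        // 3,7,11,15} in each 128-bit lane, grouping the four bytes of each component together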
1245
1246         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1247
1248         Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1249         Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1250
1251         Value* vShufResult_lo =
1252             BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1253         Value* vShufResult_hi =
1254             BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1255
1256         // after pshufb: group components together in each 128bit lane
1257         // 256i - 0    1    2    3    4    5    6    7
1258         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1259
1260         Value* vi128XY_lo = nullptr;
1261         Value* vi128XY_hi = nullptr;
1262         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1263         {
1264             vi128XY_lo = BITCAST(
1265                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1266                 v128Ty);
1267             vi128XY_hi = BITCAST(
1268                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1269                 v128Ty);
1270
1271             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1272             // 256i - 0    1    2    3    4    5    6    7
1273             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1274         }
1275
1276         // do the same for zw components
1277         Value* vi128ZW_lo = nullptr;
1278         Value* vi128ZW_hi = nullptr;
1279         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1280         {
1281             vi128ZW_lo = BITCAST(
1282                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1283                 v128Ty);
1284             vi128ZW_hi = BITCAST(
1285                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1286                 v128Ty);
1287         }
1288
1289         // init denormalize variables if needed
1290         Instruction::CastOps fpCast;
1291         Value*               conversionFactor;
1292
1293         switch (conversionType)
1294         {
1295         case CONVERT_NORMALIZED:
1296             fpCast           = Instruction::CastOps::SIToFP;
1297             conversionFactor = VIMMED1((float)(1.0 / 127.0));
1298             break;
1299         case CONVERT_SSCALED:
1300             fpCast           = Instruction::CastOps::SIToFP;
1301             conversionFactor = VIMMED1((float)(1.0));
1302             break;
1303         case CONVERT_USCALED:
1304             SWR_INVALID("Type should not be sign extended!");
1305             conversionFactor = nullptr;
1306             break;
1307         default:
1308             SWR_ASSERT(conversionType == CONVERT_NONE);
1309             conversionFactor = nullptr;
1310             break;
1311         }
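             // Note: 1.0/127.0 is the SNORM8 scale factor; after the signed-int-to-float cast it
             // maps a raw value of 127 to 1.0f, while SSCALED simply converts to float (factor 1.0).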
1312
1313         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1314         // simdvertex
1315         for (uint32_t i = 0; i < 4; i++)
1316         {
1317             if (isComponentEnabled(compMask, i))
1318             {
1319                 if (compCtrl[i] == ComponentControl::StoreSrc)
1320                 {
1321                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1322                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1323                     // if x or y, use vi128XY permute result, else use vi128ZW
1324                     Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1325                     Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1326
1327                     // sign extend
1328                     Value* temp_lo =
1329                         PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1330                     Value* temp_hi =
1331                         PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1332
1333                     Value* temp = JOIN_16(temp_lo, temp_hi);
1334
1335                     // denormalize if needed
1336                     if (conversionType != CONVERT_NONE)
1337                     {
1338                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1339                     }
1340
1341                     vVertexElements[currentVertexElement] = temp;
1342
1343                     currentVertexElement += 1;
1344                 }
1345                 else
1346                 {
1347                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1348                 }
1349
1350                 if (currentVertexElement > 3)
1351                 {
1352                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1353                     // reset to the next vVertexElement to output
1354                     currentVertexElement = 0;
1355                 }
1356             }
1357         }
1358     }
1359     // else zero extend
1360     else if ((extendType == Instruction::CastOps::ZExt) ||
1361              (extendType == Instruction::CastOps::UIToFP))
1362     {
1363         // init denormalize variables if needed
1364         Instruction::CastOps fpCast;
1365         Value*               conversionFactor;
1366
1367         switch (conversionType)
1368         {
1369         case CONVERT_NORMALIZED:
1370             fpCast           = Instruction::CastOps::UIToFP;
1371             conversionFactor = VIMMED1((float)(1.0 / 255.0));
1372             break;
1373         case CONVERT_USCALED:
1374             fpCast           = Instruction::CastOps::UIToFP;
1375             conversionFactor = VIMMED1((float)(1.0));
1376             break;
1377         case CONVERT_SSCALED:
1378             SWR_INVALID("Type should not be zero extended!");
1379             conversionFactor = nullptr;
1380             break;
1381         default:
1382             SWR_ASSERT(conversionType == CONVERT_NONE);
1383             conversionFactor = nullptr;
1384             break;
1385         }
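             // Note: 1.0/255.0 is the UNORM8 scale factor; after the unsigned-int-to-float cast it
             // maps a raw value of 255 to 1.0f.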
1386
1387         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1388         for (uint32_t i = 0; i < 4; i++)
1389         {
1390             if (isComponentEnabled(compMask, i))
1391             {
1392                 if (compCtrl[i] == ComponentControl::StoreSrc)
1393                 {
1394                     // pshufb masks for each component
1395                     Value* vConstMask;
1396                     switch (swizzle[i])
1397                     {
1398                     case 0:
1399                         // x shuffle mask
1400                         vConstMask =
1401                             C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1402                                      0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1403                         break;
1404                     case 1:
1405                         // y shuffle mask
1406                         vConstMask =
1407                             C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1408                                      1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1409                         break;
1410                     case 2:
1411                         // z shuffle mask
1412                         vConstMask =
1413                             C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1414                                      2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1415                         break;
1416                     case 3:
1417                         // w shuffle mask
1418                         vConstMask =
1419                             C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1420                                      3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1421                         break;
1422                     default:
1423                         vConstMask = nullptr;
1424                         break;
1425                     }
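                          // Note: the -1 bytes in these pshufb masks have their high bit set, so
                          // PSHUFB writes zero there; each selected byte therefore lands in the low
                          // byte of its 32-bit lane already zero extended to 32 bits.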
1426
1427                     Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1428                     Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1429
1430                     Value* temp_lo =
1431                         BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1432                     Value* temp_hi =
1433                         BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1434
1435                     // after pshufb for x channel
1436                     // 256i - 0    1    2    3    4    5    6    7
1437                     //        x000 x000 x000 x000 x000 x000 x000 x000
1438
1439                     Value* temp = JOIN_16(temp_lo, temp_hi);
1440
1441                     // denormalize if needed
1442                     if (conversionType != CONVERT_NONE)
1443                     {
1444                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1445                     }
1446
1447                     vVertexElements[currentVertexElement] = temp;
1448
1449                     currentVertexElement += 1;
1450                 }
1451                 else
1452                 {
1453                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1454                 }
1455
1456                 if (currentVertexElement > 3)
1457                 {
1458                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1459                     // reset to the next vVertexElement to output
1460                     currentVertexElement = 0;
1461                 }
1462             }
1463         }
1464     }
1465     else
1466     {
1467         SWR_INVALID("Unsupported extend type");
1468     }
1469 }
1470
1471 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1472 {
1473     // Unpack tuple args
1474     Value*&                    vGatherResult        = std::get<0>(args);
1475     Value*                     pVtxOut              = std::get<1>(args);
1476     const Instruction::CastOps extendType           = std::get<2>(args);
1477     const ConversionType       conversionType       = std::get<3>(args);
1478     uint32_t&                  currentVertexElement = std::get<4>(args);
1479     uint32_t&                  outputElt            = std::get<5>(args);
1480     const ComponentEnable      compMask             = std::get<6>(args);
1481     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1482     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1483     const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1484
1485     // cast types
1486     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1487
1488     for (uint32_t i = 0; i < 4; i++)
1489     {
1490         if (!isComponentEnabled(compMask, i))
1491             continue;
1492
1493         if (compCtrl[i] == ComponentControl::StoreSrc)
1494         {
1495             std::vector<uint32_t> vShuffleMasks[4] = {
1496                 {0, 4, 8, 12, 16, 20, 24, 28},  // x
1497                 {1, 5, 9, 13, 17, 21, 25, 29},  // y
1498                 {2, 6, 10, 14, 18, 22, 26, 30}, // z
1499                 {3, 7, 11, 15, 19, 23, 27, 31}, // w
1500             };
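                  // Note: each mask selects every fourth byte, i.e. the swizzle[i] component byte of
                  // each gathered 32-bit lane, producing a narrow i8 vector that is sign/zero
                  // extended or converted to float below.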
1501
1502             Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1503                                   UndefValue::get(v32x8Ty),
1504                                   vShuffleMasks[swizzle[i]]);
1505
1506             if ((extendType == Instruction::CastOps::SExt) ||
1507                 (extendType == Instruction::CastOps::SIToFP))
1508             {
1509                 switch (conversionType)
1510                 {
1511                 case CONVERT_NORMALIZED:
1512                     val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1513                     break;
1514                 case CONVERT_SSCALED:
1515                     val = SI_TO_FP(val, mSimdFP32Ty);
1516                     break;
1517                 case CONVERT_USCALED:
1518                     SWR_INVALID("Type should not be sign extended!");
1519                     break;
1520                 default:
1521                     SWR_ASSERT(conversionType == CONVERT_NONE);
1522                     val = S_EXT(val, mSimdInt32Ty);
1523                     break;
1524                 }
1525             }
1526             else if ((extendType == Instruction::CastOps::ZExt) ||
1527                      (extendType == Instruction::CastOps::UIToFP))
1528             {
1529                 switch (conversionType)
1530                 {
1531                 case CONVERT_NORMALIZED:
1532                     val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1533                     break;
1534                 case CONVERT_SSCALED:
1535                     SWR_INVALID("Type should not be zero extended!");
1536                     break;
1537                 case CONVERT_USCALED:
1538                     val = UI_TO_FP(val, mSimdFP32Ty);
1539                     break;
1540                 default:
1541                     SWR_ASSERT(conversionType == CONVERT_NONE);
1542                     val = Z_EXT(val, mSimdInt32Ty);
1543                     break;
1544                 }
1545             }
1546             else
1547             {
1548                 SWR_INVALID("Unsupported extend type");
1549             }
1550
1551             vVertexElements[currentVertexElement++] = val;
1552         }
1553         else
1554         {
1555             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1556         }
1557
1558         if (currentVertexElement > 3)
1559         {
1560             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1561             // reset to the next vVertexElement to output
1562             currentVertexElement = 0;
1563         }
1564     }
1565 }
1566
1567 //////////////////////////////////////////////////////////////////////////
1568 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1569 /// denormalizes if needed, converts to F32 if needed, and positions in
1570 /// the proper SIMD rows to be output to the simdvertex structure
1571 /// @param args: (tuple of args, listed below)
1572 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1573 ///   @param pVtxOut - base pointer to output simdvertex struct
1574 ///   @param extendType - sign extend or zero extend
1575 ///   @param conversionType - conversion type (normalized, scaled, or none)
1576 ///   @param currentVertexElement - reference to the current vVertexElement
1577 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
1578 ///   @param compMask - component packing mask
1579 ///   @param compCtrl - component control value
1580 ///   @param vVertexElements[4] - vertex components to output
1581 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1582 {
1583     // Unpack tuple args
1584     Value*(&vGatherResult)[2]                       = std::get<0>(args);
1585     Value*                     pVtxOut              = std::get<1>(args);
1586     const Instruction::CastOps extendType           = std::get<2>(args);
1587     const ConversionType       conversionType       = std::get<3>(args);
1588     uint32_t&                  currentVertexElement = std::get<4>(args);
1589     uint32_t&                  outputElt            = std::get<5>(args);
1590     const ComponentEnable      compMask             = std::get<6>(args);
1591     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1592     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1593
1594     // cast types
1595     Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1596     Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
1597
1598     // have to do extra work for sign extending
1599     if ((extendType == Instruction::CastOps::SExt) ||
1600         (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1601     {
1602         // is this a half-precision (partial precision) float?
1603         bool bFP = (extendType == Instruction::CastOps::FPExt);
1604
1605         Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1606         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1607
1608         // shuffle mask
1609         Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1610                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
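             // Note: each gathered 32-bit lane holds two 16-bit components; this mask moves the low
             // words (bytes 0-1, 4-5, ...) to the first half of each 128-bit lane and the high words
             // (bytes 2-3, 6-7, ...) to the second half, grouping like components together.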
1611         Value* vi128XY_lo = nullptr;
1612         Value* vi128XY_hi = nullptr;
1613         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1614         {
1615             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1616             // now..
1617
1618             Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1619             Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1620
1621             Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1622             Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1623
1624             // after pshufb: group components together in each 128bit lane
1625             // 256i - 0    1    2    3    4    5    6    7
1626             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1627
1628             vi128XY_lo = BITCAST(
1629                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1630                 v128bitTy);
1631             vi128XY_hi = BITCAST(
1632                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1633                 v128bitTy);
1634
1635             // after PERMD: move and pack xy components into each 128bit lane
1636             // 256i - 0    1    2    3    4    5    6    7
1637             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1638         }
1639
1640         // do the same for zw components
1641         Value* vi128ZW_lo = nullptr;
1642         Value* vi128ZW_hi = nullptr;
1643         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1644         {
1645             Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1646             Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1647
1648             Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1649             Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1650
1651             vi128ZW_lo = BITCAST(
1652                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1653                 v128bitTy);
1654             vi128ZW_hi = BITCAST(
1655                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1656                 v128bitTy);
1657         }
1658
1659         // init denormalize variables if needed
1660         Instruction::CastOps IntToFpCast;
1661         Value*               conversionFactor;
1662
1663         switch (conversionType)
1664         {
1665         case CONVERT_NORMALIZED:
1666             IntToFpCast      = Instruction::CastOps::SIToFP;
1667             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1668             break;
1669         case CONVERT_SSCALED:
1670             IntToFpCast      = Instruction::CastOps::SIToFP;
1671             conversionFactor = VIMMED1((float)(1.0));
1672             break;
1673         case CONVERT_USCALED:
1674             SWR_INVALID("Type should not be sign extended!");
1675             conversionFactor = nullptr;
1676             break;
1677         default:
1678             SWR_ASSERT(conversionType == CONVERT_NONE);
1679             conversionFactor = nullptr;
1680             break;
1681         }
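             // Note: 1.0/32767.0 is the SNORM16 scale factor, mapping a raw value of 32767 to 1.0f
             // after the signed-int-to-float cast.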
1682
1683         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1684         // simdvertex
1685         for (uint32_t i = 0; i < 4; i++)
1686         {
1687             if (isComponentEnabled(compMask, i))
1688             {
1689                 if (compCtrl[i] == ComponentControl::StoreSrc)
1690                 {
1691                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1692                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1693                     // if x or y, use vi128XY permute result, else use vi128ZW
1694                     Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1695                     Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1696
1697                     if (bFP)
1698                     {
1699                         // extract 128 bit lanes and convert the half floats to single precision
1700                         Value* temp_lo =
1701                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1702                         Value* temp_hi =
1703                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1704
1705                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1706                     }
1707                     else
1708                     {
1709                         // extract 128 bit lanes to sign extend each component
1710                         Value* temp_lo =
1711                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1712                         Value* temp_hi =
1713                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1714
1715                         Value* temp = JOIN_16(temp_lo, temp_hi);
1716
1717                         // denormalize if needed
1718                         if (conversionType != CONVERT_NONE)
1719                         {
1720                             temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1721                         }
1722
1723                         vVertexElements[currentVertexElement] = temp;
1724                     }
1725
1726                     currentVertexElement += 1;
1727                 }
1728                 else
1729                 {
1730                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1731                 }
1732
1733                 if (currentVertexElement > 3)
1734                 {
1735                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1736                     // reset to the next vVertexElement to output
1737                     currentVertexElement = 0;
1738                 }
1739             }
1740         }
1741     }
1742     // else zero extend
1743     else if ((extendType == Instruction::CastOps::ZExt) ||
1744              (extendType == Instruction::CastOps::UIToFP))
1745     {
1746         // pshufb masks for each component
1747         Value* vConstMask[2];
1748
1749         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1750         {
1751             // x/z shuffle mask
1752             vConstMask[0] = C<char>({
1753                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1754                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1755             });
1756         }
1757
1758         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1759         {
1760             // y/w shuffle mask
1761             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1762                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1763         }
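             // Note: pairing each word position with -1 fill bytes means PSHUFB both selects the
             // 16-bit component and zeroes the upper half of each 32-bit lane, so the shuffle itself
             // performs the zero extension.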
1764
1765         // init denormalize variables if needed
1766         Instruction::CastOps fpCast;
1767         Value*               conversionFactor;
1768
1769         switch (conversionType)
1770         {
1771         case CONVERT_NORMALIZED:
1772             fpCast           = Instruction::CastOps::UIToFP;
1773             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1774             break;
1775         case CONVERT_USCALED:
1776             fpCast           = Instruction::CastOps::UIToFP;
1777             conversionFactor = VIMMED1((float)(1.0f));
1778             break;
1779         case CONVERT_SSCALED:
1780             SWR_INVALID("Type should not be zero extended!");
1781             conversionFactor = nullptr;
1782             break;
1783         default:
1784             SWR_ASSERT(conversionType == CONVERT_NONE);
1785             conversionFactor = nullptr;
1786             break;
1787         }
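             // Note: 1.0/65535.0 is the UNORM16 scale factor, mapping a raw value of 65535 to 1.0f
             // after the unsigned-int-to-float cast.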
1788
1789         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1790         for (uint32_t i = 0; i < 4; i++)
1791         {
1792             if (isComponentEnabled(compMask, i))
1793             {
1794                 if (compCtrl[i] == ComponentControl::StoreSrc)
1795                 {
1796                     // select correct constMask for x/z or y/w pshufb
1797                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1798                     // if x or y, use vi128XY permute result, else use vi128ZW
1799                     uint32_t selectedGather = (i < 2) ? 0 : 1;
1800
1801                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
1802                     // for now..
1803
1804                     Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1805                     Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1806
1807                     Value* temp_lo = BITCAST(
1808                         PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1809                         vGatherTy);
1810                     Value* temp_hi = BITCAST(
1811                         PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1812                         vGatherTy);
1813
1814                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
1815                     // 256i - 0    1    2    3    4    5    6    7
1816                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1817
1818                     Value* temp = JOIN_16(temp_lo, temp_hi);
1819
1820                     // denormalize if needed
1821                     if (conversionType != CONVERT_NONE)
1822                     {
1823                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1824                     }
1825
1826                     vVertexElements[currentVertexElement] = temp;
1827
1828                     currentVertexElement += 1;
1829                 }
1830                 else
1831                 {
1832                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1833                 }
1834
1835                 if (currentVertexElement > 3)
1836                 {
1837                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1838                     // reset to the next vVertexElement to output
1839                     currentVertexElement = 0;
1840                 }
1841             }
1842         }
1843     }
1844     else
1845     {
1846         SWR_INVALID("Unsupported extend type");
1847     }
1848 }
1849
1850 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1851 {
1852     // Unpack tuple args
1853     Value*(&vGatherResult)[2]                       = std::get<0>(args);
1854     Value*                     pVtxOut              = std::get<1>(args);
1855     const Instruction::CastOps extendType           = std::get<2>(args);
1856     const ConversionType       conversionType       = std::get<3>(args);
1857     uint32_t&                  currentVertexElement = std::get<4>(args);
1858     uint32_t&                  outputElt            = std::get<5>(args);
1859     const ComponentEnable      compMask             = std::get<6>(args);
1860     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1861     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1862
1863     // cast types
1864     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1865     Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1866
1867     // have to do extra work for sign extending
1868     if ((extendType == Instruction::CastOps::SExt) ||
1869         (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1870     {
1871         // is this a half-precision (partial precision) float?
1872         bool bFP = (extendType == Instruction::CastOps::FPExt);
1873
1874         Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1875         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
1876                                           mVWidth / 4); // vwidth is units of 32 bits
1877
1878         // shuffle mask
1879         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1880                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1881         Value* vi128XY    = nullptr;
1882         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1883         {
1884             Value* vShufResult =
1885                 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1886             // after pshufb: group components together in each 128bit lane
1887             // 256i - 0    1    2    3    4    5    6    7
1888             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1889
1890             vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1891             // after PERMD: move and pack xy components into each 128bit lane
1892             // 256i - 0    1    2    3    4    5    6    7
1893             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1894         }
1895
1896         // do the same for zw components
1897         Value* vi128ZW = nullptr;
1898         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1899         {
1900             Value* vShufResult =
1901                 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1902             vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1903         }
1904
1905         // init denormalize variables if needed
1906         Instruction::CastOps IntToFpCast;
1907         Value*               conversionFactor;
1908
1909         switch (conversionType)
1910         {
1911         case CONVERT_NORMALIZED:
1912             IntToFpCast      = Instruction::CastOps::SIToFP;
1913             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1914             break;
1915         case CONVERT_SSCALED:
1916             IntToFpCast      = Instruction::CastOps::SIToFP;
1917             conversionFactor = VIMMED1((float)(1.0));
1918             break;
1919         case CONVERT_USCALED:
1920             SWR_INVALID("Type should not be sign extended!");
1921             conversionFactor = nullptr;
1922             break;
1923         default:
1924             SWR_ASSERT(conversionType == CONVERT_NONE);
1925             conversionFactor = nullptr;
1926             break;
1927         }
1928
1929         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1930         // simdvertex
1931         for (uint32_t i = 0; i < 4; i++)
1932         {
1933             if (isComponentEnabled(compMask, i))
1934             {
1935                 if (compCtrl[i] == ComponentControl::StoreSrc)
1936                 {
1937                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1938                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1939                     // if x or y, use vi128XY permute result, else use vi128ZW
1940                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1941
1942                     if (bFP)
1943                     {
1944                         // extract 128 bit lanes and convert the half floats to single precision
1945                         vVertexElements[currentVertexElement] =
1946                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1947                     }
1948                     else
1949                     {
1950                         // extract 128 bit lanes to sign extend each component
1951                         vVertexElements[currentVertexElement] =
1952                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1953
1954                         // denormalize if needed
1955                         if (conversionType != CONVERT_NONE)
1956                         {
1957                             vVertexElements[currentVertexElement] =
1958                                 FMUL(CAST(IntToFpCast,
1959                                           vVertexElements[currentVertexElement],
1960                                           mSimdFP32Ty),
1961                                      conversionFactor);
1962                         }
1963                     }
1964                     currentVertexElement++;
1965                 }
1966                 else
1967                 {
1968                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1969                 }
1970
1971                 if (currentVertexElement > 3)
1972                 {
1973                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1974                     // reset to the next vVertexElement to output
1975                     currentVertexElement = 0;
1976                 }
1977             }
1978         }
1979     }
1980     // else zero extend
1981     else if ((extendType == Instruction::CastOps::ZExt) ||
1982              (extendType == Instruction::CastOps::UIToFP))
1983     {
1984         // pshufb masks for each component
1985         Value* vConstMask[2];
1986         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1987         {
1988             // x/z shuffle mask
1989             vConstMask[0] = C<char>({
1990                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1991                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1992             });
1993         }
1994
1995         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1996         {
1997             // y/w shuffle mask
1998             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1999                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2000         }
2001
2002         // init denormalize variables if needed
2003         Instruction::CastOps fpCast;
2004         Value*               conversionFactor;
2005
2006         switch (conversionType)
2007         {
2008         case CONVERT_NORMALIZED:
2009             fpCast           = Instruction::CastOps::UIToFP;
2010             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2011             break;
2012         case CONVERT_USCALED:
2013             fpCast           = Instruction::CastOps::UIToFP;
2014             conversionFactor = VIMMED1((float)(1.0f));
2015             break;
2016         case CONVERT_SSCALED:
2017             SWR_INVALID("Type should not be zero extended!");
2018             conversionFactor = nullptr;
2019             break;
2020         default:
2021             SWR_ASSERT(conversionType == CONVERT_NONE);
2022             conversionFactor = nullptr;
2023             break;
2024         }
2025
2026         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2027         for (uint32_t i = 0; i < 4; i++)
2028         {
2029             if (isComponentEnabled(compMask, i))
2030             {
2031                 if (compCtrl[i] == ComponentControl::StoreSrc)
2032                 {
2033                     // select correct constMask for x/z or y/w pshufb
2034                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2035                     // if x or y, use vi128XY permute result, else use vi128ZW
2036                     uint32_t selectedGather = (i < 2) ? 0 : 1;
2037
2038                     vVertexElements[currentVertexElement] =
2039                         BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2040                                        vConstMask[selectedMask]),
2041                                 vGatherTy);
2042                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
2043                     // 256i - 0    1    2    3    4    5    6    7
2044                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2045
2046                     // denormalize if needed
2047                     if (conversionType != CONVERT_NONE)
2048                     {
2049                         vVertexElements[currentVertexElement] =
2050                             FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2051                                  conversionFactor);
2052                     }
2053                     currentVertexElement++;
2054                 }
2055                 else
2056                 {
2057                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2058                 }
2059
2060                 if (currentVertexElement > 3)
2061                 {
2062                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2063                     // reset to the next vVertexElement to output
2064                     currentVertexElement = 0;
2065                 }
2066             }
2067         }
2068     }
2069     else
2070     {
2071         SWR_INVALID("Unsupported extend type");
2072     }
2073 }
2074
2075 //////////////////////////////////////////////////////////////////////////
2076 /// @brief Output a simdvertex worth of elements to the current outputElt
2077 /// @param pVtxOut - base address of VIN output struct
2078 /// @param outputElt - simdvertex offset in VIN to write to
2079 /// @param numEltsToStore - number of simdvertex rows to write out
2080 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2081 void FetchJit::StoreVertexElements(Value*         pVtxOut,
2082                                    const uint32_t outputElt,
2083                                    const uint32_t numEltsToStore,
2084                                    Value* (&vVertexElements)[4])
2085 {
2086     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2087
2088     for (uint32_t c = 0; c < numEltsToStore; ++c)
2089     {
2090         // STORE expects FP32 x vWidth type, just bitcast if needed
2091         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2092         {
2093 #if FETCH_DUMP_VERTEX
2094             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2095 #endif
2096             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2097         }
2098 #if FETCH_DUMP_VERTEX
2099         else
2100         {
2101             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2102         }
2103 #endif
2104         // outputElt * 4 = offsetting by the size of a simdvertex
2105         // + c offsets to a 32bit x vWidth row within the current vertex
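             // e.g. outputElt = 2, c = 1 gives GEP index 9, the tenth 32bit x vWidth row of the output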
2106         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2107         STORE(vVertexElements[c], dest);
2108     }
2109 }
2110
2111 //////////////////////////////////////////////////////////////////////////
2112 /// @brief Generates a constant vector of values based on the
2113 /// ComponentControl value
2114 /// @param ctrl - ComponentControl value
2115 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2116 {
2117     switch (ctrl)
2118     {
2119     case NoStore:
2120         return VUNDEF_I();
2121     case Store0:
2122         return VIMMED1(0);
2123     case Store1Fp:
2124         return VIMMED1(1.0f);
2125     case Store1Int:
2126         return VIMMED1(1);
2127     case StoreVertexId:
2128     {
2129         if (mVWidth == 16)
2130         {
2131             Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
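                  // Note: for SIMD16 the fetch context supplies the vertex IDs as two 8-wide vectors
                  // (VertexID and VertexID2), which are joined into a single 16-wide vector here.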
2132             Value* pIdLo =
2133                 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2134             Value* pIdHi =
2135                 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2136             return JOIN_16(pIdLo, pIdHi);
2137         }
2138         else
2139         {
2140             return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2141         }
2142     }
2143     case StoreInstanceId:
2144     {
2145         Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2146         return VBROADCAST(pId);
2147     }
2148
2149
2150     case StoreSrc:
2151     default:
2152         SWR_INVALID("Invalid component control");
2153         return VUNDEF_I();
2154     }
2155 }
2156
2157 //////////////////////////////////////////////////////////////////////////
2158 /// @brief Returns the enable mask for the specified component.
2159 /// @param enableMask - enable bits
2160 /// @param component - component to check if enabled.
2161 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2162 {
2163     switch (component)
2164     {
2165         // X
2166     case 0:
2167         return (enableMask & ComponentEnable::X);
2168         // Y
2169     case 1:
2170         return (enableMask & ComponentEnable::Y);
2171         // Z
2172     case 2:
2173         return (enableMask & ComponentEnable::Z);
2174         // W
2175     case 3:
2176         return (enableMask & ComponentEnable::W);
2177
2178     default:
2179         return false;
2180     }
2181 }
2182
2183 // Don't want two threads compiling the same fetch shader simultaneously
2184 // Has problems in the JIT cache implementation
2185 // This is only a problem for fetch right now.
2186 static std::mutex gFetchCodegenMutex;
2187
2188 //////////////////////////////////////////////////////////////////////////
2189 /// @brief JITs from fetch shader IR
2190 /// @param hJitMgr - JitManager handle
2191 /// @param func   - LLVM function IR
2192 /// @return PFN_FETCH_FUNC - pointer to fetch code
2193 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2194 {
2195     const llvm::Function* func    = (const llvm::Function*)hFunc;
2196     JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2197     PFN_FETCH_FUNC        pfnFetch;
2198
2199     gFetchCodegenMutex.lock();
2200     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2201     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2202     // add new IR to the module
2203     pJitMgr->mIsModuleFinalized = true;
2204
2205 #if defined(KNOB_SWRC_TRACING)
2206     char        fName[1024];
2207     const char* funcName = func->getName().data();
2208     sprintf(fName, "%s.bin", funcName);
2209     FILE* fd = fopen(fName, "wb");
2210     fwrite((void*)pfnFetch, 1, 2048, fd);
2211     fclose(fd);
2212 #endif
2213
2214     pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2215     gFetchCodegenMutex.unlock();
2216
2217
2218     return pfnFetch;
2219 }
2220
2221 //////////////////////////////////////////////////////////////////////////
2222 /// @brief JIT compiles fetch shader
2223 /// @param hJitMgr - JitManager handle
2224 /// @param state   - fetch state to build function from
2225 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2226 {
2227     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2228
2229     pJitMgr->SetupNewModule();
2230
2231     FetchJit theJit(pJitMgr);
2232     HANDLE   hFunc = theJit.Create(state);
2233
2234     return JitFetchFunc(hJitMgr, hFunc);
2235 }