android-x86/external-mesa.git: src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file fetch_jit.cpp
24  *
25  * @brief Implementation of the fetch jitter
26  *
27  * Notes:
28  *
29  ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
32 #include "jit_api.h"
33 #include "fetch_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
36
37 //#define FETCH_DUMP_VERTEX 1
38 using namespace llvm;
39 using namespace SwrJit;
40
41 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
43 enum ConversionType
44 {
45     CONVERT_NONE,
46     CONVERT_NORMALIZED,
47     CONVERT_USCALED,
48     CONVERT_SSCALED,
49     CONVERT_SFIXED,
50 };
51
52 //////////////////////////////////////////////////////////////////////////
53 /// Interface to Jitting a fetch shader
54 //////////////////////////////////////////////////////////////////////////
55 struct FetchJit : public BuilderGfxMem
56 {
57     FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}
58
59     Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64     template <typename T>
65     Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67     // package up Shuffle*bpcGatherd args into a tuple for convenience
68     typedef std::tuple<Value*&,
69                        Value*,
70                        const Instruction::CastOps,
71                        const ConversionType,
72                        uint32_t&,
73                        uint32_t&,
74                        const ComponentEnable,
75                        const ComponentControl (&)[4],
76                        Value* (&)[4],
77                        const uint32_t (&)[4]>
78         Shuffle8bpcArgs;
79
80     void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81     void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83     typedef std::tuple<Value* (&)[2],
84                        Value*,
85                        const Instruction::CastOps,
86                        const ConversionType,
87                        uint32_t&,
88                        uint32_t&,
89                        const ComponentEnable,
90                        const ComponentControl (&)[4],
91                        Value* (&)[4]>
92         Shuffle16bpcArgs;
93
94     void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95     void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97     void StoreVertexElements(Value*         pVtxOut,
98                              const uint32_t outputElt,
99                              const uint32_t numEltsToStore,
100                              Value* (&vVertexElements)[4]);
101
102     Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104     void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105                            Value*                     streams,
106                            Value*                     vIndices,
107                            Value*                     pVtxOut);
108
109     bool IsOddFormat(SWR_FORMAT format);
110     bool IsUniformFormat(SWR_FORMAT format);
111     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112     void CreateGatherOddFormats(
113         SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114     void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116     Value* mpWorkerData;
117     Value* mpFetchInfo;
118 };
119
120 Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
121 {
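    // Name the function "FCH_" + CRC of the fetch compile state so each unique state gets a
    // distinct, reproducible symbol (the name is also used as the module identifier below)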
122     std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
123     fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
124
125     Function* fetch = Function::Create(
126         JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
127     BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
128
129     fetch->getParent()->setModuleIdentifier(fetch->getName());
130
131     IRB()->SetInsertPoint(entry);
132
133     auto argitr = fetch->arg_begin();
134
135     // Fetch shader arguments
136     Value* privateContext = &*argitr;
137     ++argitr;
138     privateContext->setName("privateContext");
139     SetPrivateContext(privateContext);
140
141     mpWorkerData = &*argitr;
142     ++argitr;
143     mpWorkerData->setName("pWorkerData");
144     mpFetchInfo = &*argitr;
145     ++argitr;
146     mpFetchInfo->setName("fetchInfo");
147     Value* pVtxOut = &*argitr;
148     pVtxOut->setName("vtxOutput");
149
150     uint32_t baseWidth = mVWidth;
151
152     SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154     // Override builder target width to force 16-wide SIMD
155 #if USE_SIMD16_SHADERS
156     SetTargetWidth(16);
157 #endif
158
159     pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161     // SWR_FETCH_CONTEXT::pStreams
162     Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163     streams->setName("pStreams");
164
165     // SWR_FETCH_CONTEXT::pIndices
166     Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167     indices->setName("pIndices");
168
169     // SWR_FETCH_CONTEXT::pLastIndex
170     Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171     pLastIndex->setName("pLastIndex");
172
173     Value* vIndices;
174     switch (fetchState.indexType)
175     {
176     case R8_UINT:
177         indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178         if (fetchState.bDisableIndexOOBCheck)
179         {
180             vIndices = LOAD(
181                 BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
182                 {(uint32_t)0});
183             vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184         }
185         else
186         {
187             vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188         }
189         break;
190     case R16_UINT:
191         if (fetchState.bDisableIndexOOBCheck)
192         {
193             vIndices = LOAD(
194                 BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
195                 {(uint32_t)0});
196             vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197         }
198         else
199         {
200             vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201         }
202         break;
203     case R32_UINT:
204         vIndices = fetchState.bDisableIndexOOBCheck
205                        ? LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
206                        : GetSimdValid32bitIndices(indices, pLastIndex);
207         break; // incoming type is already 32bit int
208     default:
209         SWR_INVALID("Unsupported index type");
210         vIndices = nullptr;
211         break;
212     }
213
214     if (fetchState.bForceSequentialAccessEnable)
215     {
216         Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
217                                        : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
218
219         // VertexData buffers are accessed sequentially; the index is equal to the vertex number
220         vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
221         vIndices = ADD(vIndices, pOffsets);
222     }
223
224     Value* vVertexId = vIndices;
225     if (fetchState.bVertexIDOffsetEnable)
226     {
227         // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
228         // correct
229         Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
230         Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
231         vVertexId           = ADD(vIndices, vBaseVertex);
232         vVertexId           = ADD(vVertexId, vStartVertex);
233     }
234
235     // store out vertex IDs
236     if (mVWidth == 16)
237     {
238         // store out in simd8 halves until core supports 16-wide natively
239         auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
240         auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
241         STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
242         STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
243     }
244     else if (mVWidth == 8)
245     {
246         STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
247     }
248
249     // store out cut mask if enabled
250     if (fetchState.bEnableCutIndex)
251     {
252         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
253         Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));
254
255         if (mVWidth == 16)
256         {
257             auto cutMaskLo = EXTRACT_16(cutMask, 0);
258             auto cutMaskHi = EXTRACT_16(cutMask, 1);
259             STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
260             STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
261         }
262         else if (mVWidth == 8)
263         {
264             STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
265         }
266     }
267
268     // Fetch attributes from memory and output to a simdvertex struct
269     JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
270
271     RET_VOID();
272
273     JitManager::DumpToFile(fetch, "src");
274
275 #if defined(_DEBUG)
276     verifyFunction(*fetch);
277 #endif
278
279     ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
280
281     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
282     setupPasses.add(createBreakCriticalEdgesPass());
283     setupPasses.add(createCFGSimplificationPass());
284     setupPasses.add(createEarlyCSEPass());
285     setupPasses.add(createPromoteMemoryToRegisterPass());
286
287     setupPasses.run(*fetch);
288
289     JitManager::DumpToFile(fetch, "se");
290
291     ::FunctionPassManager optPasses(JM()->mpCurrentModule);
292
293     ///@todo Haven't touched these either. Need to remove some of these and add others.
294     optPasses.add(createCFGSimplificationPass());
295     optPasses.add(createEarlyCSEPass());
296     optPasses.add(createInstructionCombiningPass());
297     optPasses.add(createInstructionSimplifierPass());
298     optPasses.add(createConstantPropagationPass());
299     optPasses.add(createSCCPPass());
300     optPasses.add(createAggressiveDCEPass());
301
302     optPasses.run(*fetch);
303
304     optPasses.add(createLowerX86Pass(this));
305     optPasses.run(*fetch);
306
307     JitManager::DumpToFile(fetch, "opt");
308
309
310     // Revert 16-wide override
311 #if USE_SIMD16_SHADERS
312     SetTargetWidth(baseWidth);
313 #endif
314
315     return fetch;
316 }
317
318 // returns true for odd formats that require special gather handling
319 bool FetchJit::IsOddFormat(SWR_FORMAT format)
320 {
321     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
322     if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
323     {
324         return true;
325     }
326     return false;
327 }
328
329 // format is uniform if all components are the same size and type
330 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
331 {
332     const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
333     uint32_t               bpc0  = info.bpc[0];
334     uint32_t               type0 = info.type[0];
335
336     for (uint32_t c = 1; c < info.numComps; ++c)
337     {
338         if (bpc0 != info.bpc[c] || type0 != info.type[c])
339         {
340             return false;
341         }
342     }
343     return true;
344 }
345
346 // unpacks components based on format
347 // foreach component in the pixel
348 //   mask off everything but this component
349 //   shift component to LSB
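// e.g. for a 10/10/10/2-bit layout the loop below masks with 0x3FF << 0, 0x3FF << 10,
// 0x3FF << 20 and 0x3 << 30, shifts each field down to bit 0, and writes it to result[swizzle[c]]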
350 void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
351 {
352     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
353
354     uint32_t bitOffset = 0;
355     for (uint32_t c = 0; c < info.numComps; ++c)
356     {
357         uint32_t swizzledIndex = info.swizzle[c];
358         uint32_t compBits      = info.bpc[c];
359         uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
360         Value*   comp          = AND(vInput, bitmask);
361         comp                   = LSHR(comp, bitOffset);
362
363         result[swizzledIndex] = comp;
364         bitOffset += compBits;
365     }
366 }
367
368 // gather for odd component size formats
369 // gather a full pixel per SIMD lane, then shift/mask to move each component into its
370 // own vector
371 void FetchJit::CreateGatherOddFormats(
372     SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
373 {
374     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
375
376     // only works if pixel size is <= 32bits
377     SWR_ASSERT(info.bpp <= 32);
378
379     Value* pGather;
380     if (info.bpp == 32)
381     {
382         pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
383     }
384     else
385     {
386         // Can't use a 32-bit gather for items less than 32 bits; it could cause page faults.
387         Value* pMem = ALLOCA(mSimdInt32Ty);
388         STORE(VIMMED1(0u), pMem);
389
390         pBase          = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
391         Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
392
393         for (uint32_t lane = 0; lane < mVWidth; ++lane)
394         {
395             // Get index
396             Value* index = VEXTRACT(pOffsets, C(lane));
397             Value* mask  = VEXTRACT(pMask, C(lane));
398             switch (info.bpp)
399             {
400             case 8:
401             {
402                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
403                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
404                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
405                 break;
406             }
407
408             case 16:
409             {
410                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
411                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
412                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
413                 break;
414             }
415
416
417             case 24:
418             {
419                 // First 16-bits of data
420                 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
421                 Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
422                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
423
424                 // Last 8-bits of data
425                 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
426                 pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
427                 STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
428                 break;
429             }
430
431             default:
432                 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
433                 break;
434             }
435         }
436
437         pGather = LOAD(pMem);
438     }
439
440     for (uint32_t comp = 0; comp < 4; ++comp)
441     {
442         pResult[comp] = VIMMED1((int)info.defaults[comp]);
443     }
444
445     UnpackComponents(format, pGather, pResult);
446
447     // cast to fp32
448     pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
449     pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
450     pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
451     pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
452 }
453
454 void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
455 {
456     const SWR_FORMAT_INFO& info = GetFormatInfo(format);
457
458     for (uint32_t c = 0; c < info.numComps; ++c)
459     {
460         uint32_t compIndex = info.swizzle[c];
461
462         // skip any conversion on UNUSED components
463         if (info.type[c] == SWR_TYPE_UNUSED)
464         {
465             continue;
466         }
467
468         if (info.isNormalized[c])
469         {
470             if (info.type[c] == SWR_TYPE_SNORM)
471             {
472                 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
473                 /// -1.0f.
474
475                 /// result = c * (1.0f / (2^(n-1) - 1))
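                /// e.g. for an 8-bit SNORM component the scale is 1.0f / 127.0f, so 127 maps to
                /// 1.0f and -127 maps to -1.0f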
476                 uint32_t n        = info.bpc[c];
477                 uint32_t pow2     = 1 << (n - 1);
478                 float    scale    = 1.0f / (float)(pow2 - 1);
479                 Value*   vScale   = VIMMED1(scale);
480                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
481                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
482                 texels[compIndex] = FMUL(texels[compIndex], vScale);
483             }
484             else
485             {
486                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
487
488                 /// result = c * (1.0f / (2^n - 1))
489                 uint32_t n    = info.bpc[c];
490                 uint32_t pow2 = 1 << n;
491                 // special case 24bit unorm format, which requires a full divide to meet ULP
492                 // requirement
493                 if (n == 24)
494                 {
495                     float  scale      = (float)(pow2 - 1);
496                     Value* vScale     = VIMMED1(scale);
497                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
498                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
499                     texels[compIndex] = FDIV(texels[compIndex], vScale);
500                 }
501                 else
502                 {
503                     float  scale      = 1.0f / (float)(pow2 - 1);
504                     Value* vScale     = VIMMED1(scale);
505                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
506                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
507                     texels[compIndex] = FMUL(texels[compIndex], vScale);
508                 }
509             }
510             continue;
511         }
512     }
513 }
514
515 //////////////////////////////////////////////////////////////////////////
516 /// @brief Loads attributes from memory using AVX2 GATHER(s)
517 /// @param fetchState - info about attributes to be fetched from memory
518 /// @param streams - value pointer to the current vertex stream
519 /// @param vIndices - vector value of indices to gather
520 /// @param pVtxOut - value pointer to output simdvertex struct
521 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
522                                  Value*                     streams,
523                                  Value*                     vIndices,
524                                  Value*                     pVtxOut)
525 {
526     uint32_t currentVertexElement = 0;
527     uint32_t outputElt            = 0;
528     Value*   vVertexElements[4];
529
530     Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
531     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
532     Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
533     Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
534     curInstance->setName("curInstance");
535
536     for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
537     {
538         const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
539
540         // skip element if all components are disabled
541         if (ied.ComponentPacking == ComponentEnable::NONE)
542         {
543             continue;
544         }
545
546         const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
547         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
548         uint32_t bpc =
549             info.bpp /
550             info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
551
552         Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
553
554         // VGATHER* takes an *i8 src pointer
555         Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
556
557         Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
558         Value* vStride = VBROADCAST(stride);
559
560         // max vertex index that is fully in bounds
561         Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
562         maxVertex        = LOAD(maxVertex);
563
564         Value* minVertex = NULL;
565         if (fetchState.bPartialVertexBuffer)
566         {
567             // min vertex index for low bounds OOB checking
568             minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
569             minVertex = LOAD(minVertex);
570         }
571
572         if (fetchState.bInstanceIDOffsetEnable)
573         {
574             // the InstanceID (curInstance) value is offset by StartInstanceLocation
575             curInstance = ADD(curInstance, startInstance);
576         }
577
578         Value* vCurIndices;
579         Value* startOffset;
580         Value* vInstanceStride = VIMMED1(0);
581
582         if (ied.InstanceEnable)
583         {
584             Value* stepRate = C(ied.InstanceAdvancementState);
585
586             // prevent a div by 0 for 0 step rate
587             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
588             stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
589
590             // calc the current offset into instanced data buffer
591             Value* calcInstance = UDIV(curInstance, stepRate);
592
593             // if step rate is 0, every instance gets instance 0
594             calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
595
596             vCurIndices = VBROADCAST(calcInstance);
597             startOffset = startInstance;
598         }
599         else if (ied.InstanceStrideEnable)
600         {
601             // grab the instance advancement state, which determines the stride in bytes from one
602             // instance to the next
603             Value* stepRate = C(ied.InstanceAdvancementState);
604             vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
605
606             // offset indices by baseVertex
607             vCurIndices = ADD(vIndices, vBaseVertex);
608
609             startOffset = startVertex;
610             SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
611         }
612         else
613         {
614             // offset indices by baseVertex
615             vCurIndices = ADD(vIndices, vBaseVertex);
616             startOffset = startVertex;
617         }
618
619         // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
620         // do 64bit address offset calculations.
621
622         // calculate byte offset to the start of the VB
623         Value* baseOffset     = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
624         pStreamBase           = GEP(pStreamBase, baseOffset);
625         Value* pStreamBaseGFX = ADD(stream, baseOffset);
626
627         // if we have a start offset, subtract from max vertex. Used for OOB check
628         maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
629         Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
630         // if we have a negative value, we're already OOB. clamp at 0.
631         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
632
633         if (fetchState.bPartialVertexBuffer)
634         {
635             // similarly for min vertex
636             minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
637             Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
638             minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
639         }
640
641         // Load the in bounds size of a partially valid vertex
642         Value* partialInboundsSize =
643             GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
644         partialInboundsSize       = LOAD(partialInboundsSize);
645         Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
646         Value* vBpp               = VBROADCAST(C(info.Bpp));
647         Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
648
649         // is the element <= the partially valid size
650         Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
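        // lanes that index the last, partially valid vertex may still fetch this element if the
        // element ends at or before partialInboundsSize; that mask is blended in below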
651
652         // override cur indices with 0 if pitch is 0
653         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
654         vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
655
656         // are vertices partially OOB?
657         Value* vMaxVertex      = VBROADCAST(maxVertex);
658         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
659
660         // are vertices fully in bounds?
661         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
662
663         Value* vGatherMask;
664         if (fetchState.bPartialVertexBuffer)
665         {
666             // are vertices below minVertex limit?
667             Value* vMinVertex     = VBROADCAST(minVertex);
668             Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
669
670             // only fetch lanes that pass both tests
671             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
672         }
673         else
674         {
675             vGatherMask = vMaxGatherMask;
676         }
677
678         // blend in any partially OOB indices that have valid elements
679         vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
680
681         // calculate the actual offsets into the VB
682         Value* vOffsets = MUL(vCurIndices, vStride);
683         vOffsets        = ADD(vOffsets, vAlignmentOffsets);
684
685         // if instance stride enable is:
686         //  true  - add product of the instanceID and advancement state to the offset into the VB
687         //  false - value of vInstanceStride has been initialized to zero
688         vOffsets = ADD(vOffsets, vInstanceStride);
689
690         // Packing and component control
691         ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
692         const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
693                                            (ComponentControl)ied.ComponentControl1,
694                                            (ComponentControl)ied.ComponentControl2,
695                                            (ComponentControl)ied.ComponentControl3};
696
697         // Special gather/conversion for formats without equal component sizes
698         if (IsOddFormat((SWR_FORMAT)ied.Format))
699         {
700             Value* pResults[4];
701             CreateGatherOddFormats(
702                 (SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
703             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
704
705             for (uint32_t c = 0; c < 4; c += 1)
706             {
707                 if (isComponentEnabled(compMask, c))
708                 {
709                     vVertexElements[currentVertexElement++] = pResults[c];
710                     if (currentVertexElement > 3)
711                     {
712                         StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
713                         // reset to the next vVertexElement to output
714                         currentVertexElement = 0;
715                     }
716                 }
717             }
718         }
719         else if (info.type[0] == SWR_TYPE_FLOAT)
720         {
721             ///@todo: support 64 bit vb accesses
722             Value* gatherSrc = VIMMED1(0.0f);
723
724             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
725                        "Unsupported format for standard gather fetch.");
726
727             // Gather components from memory to store in a simdvertex structure
728             switch (bpc)
729             {
730             case 16:
731             {
732                 Value* vGatherResult[2];
733
734                 // if we have at least one component out of x or y to fetch
735                 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
736                 {
737                     vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
738                     // e.g. result of first 8x32bit integer gather for 16bit components
739                     // 256i - 0    1    2    3    4    5    6    7
740                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
741                     //
742                 }
743
744                 // if we have at least one component out of z or w to fetch
745                 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
746                 {
747                     // offset base to the next components(zw) in the vertex to gather
748                     pStreamBase = GEP(pStreamBase, C((char)4));
749
750                     vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
751                     // e.g. result of second 8x32bit integer gather for 16bit components
752                     // 256i - 0    1    2    3    4    5    6    7
753                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
754                     //
755                 }
756
757                 // if we have at least one component to shuffle into place
758                 if (compMask)
759                 {
760                     Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
761                                                                   pVtxOut,
762                                                                   Instruction::CastOps::FPExt,
763                                                                   CONVERT_NONE,
764                                                                   currentVertexElement,
765                                                                   outputElt,
766                                                                   compMask,
767                                                                   compCtrl,
768                                                                   vVertexElements);
769
770                     // Shuffle gathered components into place in simdvertex struct
771                     mVWidth == 16 ? Shuffle16bpcGather16(args)
772                                   : Shuffle16bpcGather(args); // outputs to vVertexElements ref
773                 }
774             }
775             break;
776             case 32:
777             {
778                 for (uint32_t i = 0; i < 4; i += 1)
779                 {
780                     if (isComponentEnabled(compMask, i))
781                     {
782                         // if we need to gather the component
783                         if (compCtrl[i] == StoreSrc)
784                         {
785                             // Gather a SIMD of vertices
786                             // APIs allow a 4GB range for offsets
787                             // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
788                             // But, we know that elements must be aligned for FETCH. :)
789                             // Right shift the offset by a bit and then scale by 2 to remove the
790                             // sign extension.
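                            // e.g. a byte offset of 3,000,000,000 (0xB2D05E00) would read as negative
                            // in a signed i32; 0xB2D05E00 >> 1 = 0x59682F00 stays positive, and the
                            // gather scale of 2 restores the original offset (low bit is 0 by alignment)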
791                             Value* vShiftedOffsets = LSHR(vOffsets, 1);
792                             vVertexElements[currentVertexElement++] =
793                                 GATHERPS(gatherSrc,
794                                          pStreamBaseGFX,
795                                          vShiftedOffsets,
796                                          vGatherMask,
797                                          2,
798                                          GFX_MEM_CLIENT_FETCH);
799                         }
800                         else
801                         {
802                             vVertexElements[currentVertexElement++] =
803                                 GenerateCompCtrlVector(compCtrl[i]);
804                         }
805
806                         if (currentVertexElement > 3)
807                         {
808                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
809                             // reset to the next vVertexElement to output
810                             currentVertexElement = 0;
811                         }
812                     }
813
814                     // offset base to the next component in the vertex to gather
815                     pStreamBase    = GEP(pStreamBase, C((char)4));
816                     pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
817                 }
818             }
819             break;
820             case 64:
821             {
822                 for (uint32_t i = 0; i < 4; i += 1)
823                 {
824                     if (isComponentEnabled(compMask, i))
825                     {
826                         // if we need to gather the component
827                         if (compCtrl[i] == StoreSrc)
828                         {
829                             Value* vShufLo;
830                             Value* vShufHi;
831                             Value* vShufAll;
832
833                             if (mVWidth == 8)
834                             {
835                                 vShufLo  = C({0, 1, 2, 3});
836                                 vShufHi  = C({4, 5, 6, 7});
837                                 vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
838                             }
839                             else
840                             {
841                                 SWR_ASSERT(mVWidth == 16);
842                                 vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
843                                 vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
844                                 vShufAll =
845                                     C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
846                             }
847
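                            // each GATHERPD returns mVWidth/2 doubles, so split the mask and offsets
                            // into low/high halves, gather each half, convert to float, and re-join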
848                             Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
849                             Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
850
851                             Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
852                             Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
853
854                             Value* vZeroDouble = VECTOR_SPLAT(
855                                 mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
856
857                             Value* pGatherLo =
858                                 GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
859                             Value* pGatherHi =
860                                 GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
861
862                             pGatherLo = VCVTPD2PS(pGatherLo);
863                             pGatherHi = VCVTPD2PS(pGatherHi);
864
865                             Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
866
867                             vVertexElements[currentVertexElement++] = pGather;
868                         }
869                         else
870                         {
871                             vVertexElements[currentVertexElement++] =
872                                 GenerateCompCtrlVector(compCtrl[i]);
873                         }
874
875                         if (currentVertexElement > 3)
876                         {
877                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
878                             // reset to the next vVertexElement to output
879                             currentVertexElement = 0;
880                         }
881                     }
882
883                     // offset base to the next component  in the vertex to gather
884                     pStreamBase = GEP(pStreamBase, C((char)8));
885                 }
886             }
887             break;
888             default:
889                 SWR_INVALID("Tried to fetch invalid FP format");
890                 break;
891             }
892         }
893         else
894         {
895             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
896             ConversionType       conversionType = CONVERT_NONE;
897
898             SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
899                        "Unsupported format for standard gather fetch.");
900
901             switch (info.type[0])
902             {
903             case SWR_TYPE_UNORM:
904                 conversionType = CONVERT_NORMALIZED;
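                // deliberate fall-through: UNORM is zero-extended like UINT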
905             case SWR_TYPE_UINT:
906                 extendCastType = Instruction::CastOps::ZExt;
907                 break;
908             case SWR_TYPE_SNORM:
909                 conversionType = CONVERT_NORMALIZED;
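                // deliberate fall-through: SNORM is sign-extended like SINT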
910             case SWR_TYPE_SINT:
911                 extendCastType = Instruction::CastOps::SExt;
912                 break;
913             case SWR_TYPE_USCALED:
914                 conversionType = CONVERT_USCALED;
915                 extendCastType = Instruction::CastOps::UIToFP;
916                 break;
917             case SWR_TYPE_SSCALED:
918                 conversionType = CONVERT_SSCALED;
919                 extendCastType = Instruction::CastOps::SIToFP;
920                 break;
921             case SWR_TYPE_SFIXED:
922                 conversionType = CONVERT_SFIXED;
923                 extendCastType = Instruction::CastOps::SExt;
924                 break;
925             default:
926                 break;
927             }
928
929             // value substituted when component of gather is masked
930             Value* gatherSrc = VIMMED1(0);
931
932             // Gather components from memory to store in a simdvertex structure
933             switch (bpc)
934             {
935             case 8:
936             {
937                 // if we have at least one component to fetch
938                 if (compMask)
939                 {
940                     Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
941                     // e.g. result of an 8x32bit integer gather for 8bit components
942                     // 256i - 0    1    2    3    4    5    6    7
943                     //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
944
945                     Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
946                                                                  pVtxOut,
947                                                                  extendCastType,
948                                                                  conversionType,
949                                                                  currentVertexElement,
950                                                                  outputElt,
951                                                                  compMask,
952                                                                  compCtrl,
953                                                                  vVertexElements,
954                                                                  info.swizzle);
955
956                     // Shuffle gathered components into place in simdvertex struct
957                     mVWidth == 16 ? Shuffle8bpcGatherd16(args)
958                                   : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
959                 }
960             }
961             break;
962             case 16:
963             {
964                 Value* vGatherResult[2];
965
966                 // if we have at least one component out of x or y to fetch
967                 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
968                 {
969                     vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
970                     // e.g. result of first 8x32bit integer gather for 16bit components
971                     // 256i - 0    1    2    3    4    5    6    7
972                     //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
973                     //
974                 }
975
976                 // if we have at least one component out of z or w to fetch
977                 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
978                 {
979                     // offset base to the next components(zw) in the vertex to gather
980                     pStreamBase = GEP(pStreamBase, C((char)4));
981
982                     vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
983                     // e.g. result of second 8x32bit integer gather for 16bit components
984                     // 256i - 0    1    2    3    4    5    6    7
985                     //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
986                     //
987                 }
988
989                 // if we have at least one component to shuffle into place
990                 if (compMask)
991                 {
992                     Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
993                                                                   pVtxOut,
994                                                                   extendCastType,
995                                                                   conversionType,
996                                                                   currentVertexElement,
997                                                                   outputElt,
998                                                                   compMask,
999                                                                   compCtrl,
1000                                                                   vVertexElements);
1001
1002                     // Shuffle gathered components into place in simdvertex struct
1003                     mVWidth == 16 ? Shuffle16bpcGather16(args)
1004                                   : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1005                 }
1006             }
1007             break;
1008             case 32:
1009             {
1010                 // Gather components into place in the simdvertex struct
1011                 for (uint32_t i = 0; i < 4; i++)
1012                 {
1013                     if (isComponentEnabled(compMask, i))
1014                     {
1015                         // if we need to gather the component
1016                         if (compCtrl[i] == StoreSrc)
1017                         {
1018                             Value* pGather =
1019                                 GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
1020
1021                             if (conversionType == CONVERT_USCALED)
1022                             {
1023                                 pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1024                             }
1025                             else if (conversionType == CONVERT_SSCALED)
1026                             {
1027                                 pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1028                             }
1029                             else if (conversionType == CONVERT_SFIXED)
1030                             {
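                                // SFIXED is 16.16 fixed point: convert to float, then scale by 1/2^16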
1031                                 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1032                                                VBROADCAST(C(1 / 65536.0f)));
1033                             }
1034
1035                             vVertexElements[currentVertexElement++] = pGather;
1036
1037                             // e.g. result of a single 8x32bit integer gather for 32bit components
1038                             // 256i - 0    1    2    3    4    5    6    7
1039                             //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1040                         }
1041                         else
1042                         {
1043                             vVertexElements[currentVertexElement++] =
1044                                 GenerateCompCtrlVector(compCtrl[i]);
1045                         }
1046
1047                         if (currentVertexElement > 3)
1048                         {
1049                             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1050
1051                             // reset to the next vVertexElement to output
1052                             currentVertexElement = 0;
1053                         }
1054                     }
1055
1056                     // offset base to the next component  in the vertex to gather
1057                     pStreamBase = GEP(pStreamBase, C((char)4));
1058                 }
1059             }
1060             break;
1061             }
1062         }
1063     }
1064
1065     // if we have a partially filled vVertexElement struct, output it
1066     if (currentVertexElement > 0)
1067     {
1068         StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1069     }
1070 }
1071
1072 template <typename T>
1073 Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1074 {
1075     SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1076                "Function expects gfxptr_t for both input parameters.");
1077
1078     Type* Ty = nullptr;
1079
1080     static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1081                   "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1082     constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1083     if (bSize)
1084     {
1085         Ty = mInt16PtrTy;
1086     }
1087     else if (sizeof(T) == sizeof(uint8_t))
1088     {
1089         Ty = mInt8PtrTy;
1090     }
1091     else
1092     {
1093         SWR_ASSERT(false, "This should never happen as per static_assert above.");
1094     }
1095
1096     Value* vIndices = VUNDEF_I();
1097
1098     {
1099         // store 0 index on stack to be used to conditionally load from if index address is OOB
1100         Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1101         STORE(C((T)0), pZeroIndex);
1102
1103         // Load a SIMD of index pointers
1104         for (int64_t lane = 0; lane < mVWidth; lane++)
1105         {
1106             // Calculate the address of the requested index
1107             Value* pIndex = GEP(pIndices, C(lane), Ty);
1108
1109             pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1110
1111             // check if the address is less than the max index,
1112             Value* mask = ICMP_ULT(pIndex, pLastIndex);
1113
1114             // if valid, load the index. if not, load 0 from the stack
1115             Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1116             Value* index  = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
1117
1118             // zero extended index to 32 bits and insert into the correct simd lane
1119             index    = Z_EXT(index, mInt32Ty);
1120             vIndices = VINSERT(vIndices, index, lane);
1121         }
1122     }
1123
1124     return vIndices;
1125 }
1126
1127 //////////////////////////////////////////////////////////////////////////
1128 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1129 /// *Note* have to do 8bit index checking in scalar until we have AVX-512
1130 /// support
1131 /// @param pIndices - pointer to 8 bit indices
1132 /// @param pLastIndex - pointer to last valid index
1133 Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1134 {
1135     return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1136 }
1137
1138 //////////////////////////////////////////////////////////////////////////
1139 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1140 /// *Note* have to do 16bit index checking in scalar until we have AVX-512
1141 /// support
1142 /// @param pIndices - pointer to 16 bit indices
1143 /// @param pLastIndex - pointer to last valid index
1144 Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1145 {
1146     return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1147 }
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// @brief Loads a simd of valid indices. OOB indices are set to 0
1151 /// @param pIndices - pointer to 32 bit indices
1152 /// @param pLastIndex - pointer to last valid index
1153 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1154 {
1155     DataLayout dL(JM()->mpCurrentModule);
1156     Value*     iLastIndex = pLastIndex;
1157     Value*     iIndices   = pIndices;
1158
1159     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1160     Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1161     numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
1162     numIndicesLeft        = SDIV(numIndicesLeft, C(4));
1163
1164     // create a vector of index counts from the base index ptr passed into the fetch
1165     Constant* vIndexOffsets;
1166     if (mVWidth == 8)
1167     {
1168         vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1169     }
1170     else
1171     {
1172         vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1173     }
1174
1175     // compare index count to the max valid index
1176     // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1177     //     vIndexOffsets  0 1 2 3 4 5 6 7
1178     //     ------------------------------
1179     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1180     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1181     Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
1182     Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1183
1184     // Load the indices; OOB loads 0
1185     return MASKED_LOAD(pIndices,
1186                        4,
1187                        vIndexMask,
1188                        VIMMED1(0),
1189                        "vIndices",
1190                        PointerType::get(mSimdInt32Ty, 0),
1191                        GFX_MEM_CLIENT_FETCH);
1192 }
1193
1194 //////////////////////////////////////////////////////////////////////////
1195 /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1196 /// denormalizes if needed, converts to F32 if needed, and positions in
1197 /// the proper SIMD rows to be output to the simdvertex structure
1198 /// @param args: (tuple of args, listed below)
1199 ///   @param vGatherResult - 8 gathered 8bpc vertices
1200 ///   @param pVtxOut - base pointer to output simdvertex struct
1201 ///   @param extendType - sign extend or zero extend
1202 ///   @param conversionType - conversion to apply (none, normalized, scaled, or sfixed)
1203 ///   @param currentVertexElement - reference to the current vVertexElement
1204 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
1205 ///   @param compMask - component packing mask
1206 ///   @param compCtrl - component control val
1207 ///   @param vVertexElements[4] - vertex components to output
1208 ///   @param swizzle[4] - component swizzle location
1209 void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1210 {
1211     // Unpack tuple args
1212     Value*&                    vGatherResult        = std::get<0>(args);
1213     Value*                     pVtxOut              = std::get<1>(args);
1214     const Instruction::CastOps extendType           = std::get<2>(args);
1215     const ConversionType       conversionType       = std::get<3>(args);
1216     uint32_t&                  currentVertexElement = std::get<4>(args);
1217     uint32_t&                  outputElt            = std::get<5>(args);
1218     const ComponentEnable      compMask             = std::get<6>(args);
1219     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1220     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1221     const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1222
1223     // cast types
1224     Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1225     Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
1226
1227     // have to do extra work for sign extending
1228     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1229     {
1230         Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
1231         Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1232
1233         // shuffle mask, including any swizzling
1234         const char x          = (char)swizzle[0];
1235         const char y          = (char)swizzle[1];
1236         const char z          = (char)swizzle[2];
1237         const char w          = (char)swizzle[3];
1238         Value*     vConstMask = C<char>(
1239             {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
1240              char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
1241              char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
1242              char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
1243              char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
1244              char(w + 8), char(w + 12)});
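        // for the identity swizzle (x,y,z,w = 0,1,2,3) this mask is {0,4,8,12, 1,5,9,13, 2,6,10,14,
        // 3,7,11,15} in each 128-bit lane, grouping the four bytes of each component together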
1245
1246         // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.
1247
1248         Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1249         Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1250
1251         Value* vShufResult_lo =
1252             BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1253         Value* vShufResult_hi =
1254             BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1255
1256         // after pshufb: group components together in each 128bit lane
1257         // 256i - 0    1    2    3    4    5    6    7
1258         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1259
1260         Value* vi128XY_lo = nullptr;
1261         Value* vi128XY_hi = nullptr;
1262         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1263         {
1264             vi128XY_lo = BITCAST(
1265                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1266                 v128Ty);
1267             vi128XY_hi = BITCAST(
1268                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1269                 v128Ty);
1270
1271             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1272             // 256i - 0    1    2    3    4    5    6    7
1273             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1274         }
1275
1276         // do the same for zw components
1277         Value* vi128ZW_lo = nullptr;
1278         Value* vi128ZW_hi = nullptr;
1279         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1280         {
1281             vi128ZW_lo = BITCAST(
1282                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1283                 v128Ty);
1284             vi128ZW_hi = BITCAST(
1285                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1286                 v128Ty);
1287         }
1288
1289         // init denormalize variables if needed
1290         Instruction::CastOps fpCast;
1291         Value*               conversionFactor;
1292
1293         switch (conversionType)
1294         {
1295         case CONVERT_NORMALIZED:
1296             fpCast           = Instruction::CastOps::SIToFP;
1297             conversionFactor = VIMMED1((float)(1.0 / 127.0));
1298             break;
1299         case CONVERT_SSCALED:
1300             fpCast           = Instruction::CastOps::SIToFP;
1301             conversionFactor = VIMMED1((float)(1.0));
1302             break;
1303         case CONVERT_USCALED:
1304             SWR_INVALID("Type should not be sign extended!");
1305             conversionFactor = nullptr;
1306             break;
1307         default:
1308             SWR_ASSERT(conversionType == CONVERT_NONE);
1309             conversionFactor = nullptr;
1310             break;
1311         }
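             // Note: 1.0/127.0 is the SNORM8 scale factor; after the signed-int-to-float cast it
             // maps a raw value of 127 to 1.0f, while SSCALED simply converts to float (factor 1.0).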
1312
1313         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1314         // simdvertex
1315         for (uint32_t i = 0; i < 4; i++)
1316         {
1317             if (isComponentEnabled(compMask, i))
1318             {
1319                 if (compCtrl[i] == ComponentControl::StoreSrc)
1320                 {
1321                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1322                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1323                     // if x or y, use vi128XY permute result, else use vi128ZW
1324                     Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1325                     Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1326
1327                     // sign extend
1328                     Value* temp_lo =
1329                         PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1330                     Value* temp_hi =
1331                         PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1332
1333                     Value* temp = JOIN_16(temp_lo, temp_hi);
1334
1335                     // denormalize if needed
1336                     if (conversionType != CONVERT_NONE)
1337                     {
1338                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1339                     }
1340
1341                     vVertexElements[currentVertexElement] = temp;
1342
1343                     currentVertexElement += 1;
1344                 }
1345                 else
1346                 {
1347                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1348                 }
1349
1350                 if (currentVertexElement > 3)
1351                 {
1352                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1353                     // reset to the next vVertexElement to output
1354                     currentVertexElement = 0;
1355                 }
1356             }
1357         }
1358     }
1359     // else zero extend
1360     else if ((extendType == Instruction::CastOps::ZExt) ||
1361              (extendType == Instruction::CastOps::UIToFP))
1362     {
1363         // init denormalize variables if needed
1364         Instruction::CastOps fpCast;
1365         Value*               conversionFactor;
1366
1367         switch (conversionType)
1368         {
1369         case CONVERT_NORMALIZED:
1370             fpCast           = Instruction::CastOps::UIToFP;
1371             conversionFactor = VIMMED1((float)(1.0 / 255.0));
1372             break;
1373         case CONVERT_USCALED:
1374             fpCast           = Instruction::CastOps::UIToFP;
1375             conversionFactor = VIMMED1((float)(1.0));
1376             break;
1377         case CONVERT_SSCALED:
1378             SWR_INVALID("Type should not be zero extended!");
1379             conversionFactor = nullptr;
1380             break;
1381         default:
1382             SWR_ASSERT(conversionType == CONVERT_NONE);
1383             conversionFactor = nullptr;
1384             break;
1385         }
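             // Note: 1.0/255.0 is the UNORM8 scale factor; after the unsigned-int-to-float cast it
             // maps a raw value of 255 to 1.0f.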
1386
1387         // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1388         for (uint32_t i = 0; i < 4; i++)
1389         {
1390             if (isComponentEnabled(compMask, i))
1391             {
1392                 if (compCtrl[i] == ComponentControl::StoreSrc)
1393                 {
1394                     // pshufb masks for each component
1395                     Value* vConstMask;
1396                     switch (swizzle[i])
1397                     {
1398                     case 0:
1399                         // x shuffle mask
1400                         vConstMask =
1401                             C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1402                                      0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1403                         break;
1404                     case 1:
1405                         // y shuffle mask
1406                         vConstMask =
1407                             C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1408                                      1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1409                         break;
1410                     case 2:
1411                         // z shuffle mask
1412                         vConstMask =
1413                             C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1414                                      2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1415                         break;
1416                     case 3:
1417                         // w shuffle mask
1418                         vConstMask =
1419                             C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1420                                      3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1421                         break;
1422                     default:
1423                         vConstMask = nullptr;
1424                         break;
1425                     }
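                          // Note: the -1 bytes in these pshufb masks have their high bit set, so
                          // PSHUFB writes zero there; each selected byte therefore lands in the low
                          // byte of its 32-bit lane already zero extended to 32 bits.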
1426
1427                     Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1428                     Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1429
1430                     Value* temp_lo =
1431                         BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1432                     Value* temp_hi =
1433                         BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1434
1435                     // after pshufb for x channel
1436                     // 256i - 0    1    2    3    4    5    6    7
1437                     //        x000 x000 x000 x000 x000 x000 x000 x000
1438
1439                     Value* temp = JOIN_16(temp_lo, temp_hi);
1440
1441                     // denormalize if needed
1442                     if (conversionType != CONVERT_NONE)
1443                     {
1444                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1445                     }
1446
1447                     vVertexElements[currentVertexElement] = temp;
1448
1449                     currentVertexElement += 1;
1450                 }
1451                 else
1452                 {
1453                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1454                 }
1455
1456                 if (currentVertexElement > 3)
1457                 {
1458                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1459                     // reset to the next vVertexElement to output
1460                     currentVertexElement = 0;
1461                 }
1462             }
1463         }
1464     }
1465     else
1466     {
1467         SWR_INVALID("Unsupported extend type");
1468     }
1469 }
1470
1471 void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1472 {
1473     // Unpack tuple args
1474     Value*&                    vGatherResult        = std::get<0>(args);
1475     Value*                     pVtxOut              = std::get<1>(args);
1476     const Instruction::CastOps extendType           = std::get<2>(args);
1477     const ConversionType       conversionType       = std::get<3>(args);
1478     uint32_t&                  currentVertexElement = std::get<4>(args);
1479     uint32_t&                  outputElt            = std::get<5>(args);
1480     const ComponentEnable      compMask             = std::get<6>(args);
1481     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1482     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1483     const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1484
1485     // cast types
1486     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1487
1488     for (uint32_t i = 0; i < 4; i++)
1489     {
1490         if (!isComponentEnabled(compMask, i))
1491             continue;
1492
1493         if (compCtrl[i] == ComponentControl::StoreSrc)
1494         {
1495             std::vector<uint32_t> vShuffleMasks[4] = {
1496                 {0, 4, 8, 12, 16, 20, 24, 28},  // x
1497                 {1, 5, 9, 13, 17, 21, 25, 29},  // y
1498                 {2, 6, 10, 14, 18, 22, 26, 30}, // z
1499                 {3, 7, 11, 15, 19, 23, 27, 31}, // w
1500             };
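                  // Note: each mask selects every fourth byte, i.e. the swizzle[i] component byte of
                  // each gathered 32-bit lane, producing a narrow i8 vector that is sign/zero
                  // extended or converted to float below.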
1501
1502             Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1503                                   UndefValue::get(v32x8Ty),
1504                                   vShuffleMasks[swizzle[i]]);
1505
1506             if ((extendType == Instruction::CastOps::SExt) ||
1507                 (extendType == Instruction::CastOps::SIToFP))
1508             {
1509                 switch (conversionType)
1510                 {
1511                 case CONVERT_NORMALIZED:
1512                     val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1513                     break;
1514                 case CONVERT_SSCALED:
1515                     val = SI_TO_FP(val, mSimdFP32Ty);
1516                     break;
1517                 case CONVERT_USCALED:
1518                     SWR_INVALID("Type should not be sign extended!");
1519                     break;
1520                 default:
1521                     SWR_ASSERT(conversionType == CONVERT_NONE);
1522                     val = S_EXT(val, mSimdInt32Ty);
1523                     break;
1524                 }
1525             }
1526             else if ((extendType == Instruction::CastOps::ZExt) ||
1527                      (extendType == Instruction::CastOps::UIToFP))
1528             {
1529                 switch (conversionType)
1530                 {
1531                 case CONVERT_NORMALIZED:
1532                     val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1533                     break;
1534                 case CONVERT_SSCALED:
1535                     SWR_INVALID("Type should not be zero extended!");
1536                     break;
1537                 case CONVERT_USCALED:
1538                     val = UI_TO_FP(val, mSimdFP32Ty);
1539                     break;
1540                 default:
1541                     SWR_ASSERT(conversionType == CONVERT_NONE);
1542                     val = Z_EXT(val, mSimdInt32Ty);
1543                     break;
1544                 }
1545             }
1546             else
1547             {
1548                 SWR_INVALID("Unsupported extend type");
1549             }
1550
1551             vVertexElements[currentVertexElement++] = val;
1552         }
1553         else
1554         {
1555             vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1556         }
1557
1558         if (currentVertexElement > 3)
1559         {
1560             StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1561             // reset to the next vVertexElement to output
1562             currentVertexElement = 0;
1563         }
1564     }
1565 }
1566
1567 //////////////////////////////////////////////////////////////////////////
1568 /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1569 /// denormalizes if needed, converts to F32 if needed, and positions in
1570 /// the proper SIMD rows to be output to the simdvertex structure
1571 /// @param args: (tuple of args, listed below)
1572 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1573 ///   @param pVtxOut - base pointer to output simdvertex struct
1574 ///   @param extendType - sign extend or zero extend
1575 ///   @param conversionType - conversion type (normalized, scaled, or none)
1576 ///   @param currentVertexElement - reference to the current vVertexElement
1577 ///   @param outputElt - reference to the current offset from simdvertex we're outputting to
1578 ///   @param compMask - component packing mask
1579 ///   @param compCtrl - component control value
1580 ///   @param vVertexElements[4] - vertex components to output
1581 void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1582 {
1583     // Unpack tuple args
1584     Value*(&vGatherResult)[2]                       = std::get<0>(args);
1585     Value*                     pVtxOut              = std::get<1>(args);
1586     const Instruction::CastOps extendType           = std::get<2>(args);
1587     const ConversionType       conversionType       = std::get<3>(args);
1588     uint32_t&                  currentVertexElement = std::get<4>(args);
1589     uint32_t&                  outputElt            = std::get<5>(args);
1590     const ComponentEnable      compMask             = std::get<6>(args);
1591     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1592     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1593
1594     // cast types
1595     Type* vGatherTy = VectorType::get(mInt32Ty, 8);
1596     Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
1597
1598     // have to do extra work for sign extending
1599     if ((extendType == Instruction::CastOps::SExt) ||
1600         (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1601     {
1602         // is this a half-precision (partial precision) float?
1603         bool bFP = (extendType == Instruction::CastOps::FPExt);
1604
1605         Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1606         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1607
1608         // shuffle mask
1609         Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1610                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
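             // Note: each gathered 32-bit lane holds two 16-bit components; this mask moves the low
             // words (bytes 0-1, 4-5, ...) to the first half of each 128-bit lane and the high words
             // (bytes 2-3, 6-7, ...) to the second half, grouping like components together.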
1611         Value* vi128XY_lo = nullptr;
1612         Value* vi128XY_hi = nullptr;
1613         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1614         {
1615             // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1616             // now..
1617
1618             Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1619             Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1620
1621             Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1622             Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1623
1624             // after pshufb: group components together in each 128bit lane
1625             // 256i - 0    1    2    3    4    5    6    7
1626             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1627
1628             vi128XY_lo = BITCAST(
1629                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1630                 v128bitTy);
1631             vi128XY_hi = BITCAST(
1632                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1633                 v128bitTy);
1634
1635             // after PERMD: move and pack xy components into each 128bit lane
1636             // 256i - 0    1    2    3    4    5    6    7
1637             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1638         }
1639
1640         // do the same for zw components
1641         Value* vi128ZW_lo = nullptr;
1642         Value* vi128ZW_hi = nullptr;
1643         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1644         {
1645             Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1646             Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1647
1648             Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1649             Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1650
1651             vi128ZW_lo = BITCAST(
1652                 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1653                 v128bitTy);
1654             vi128ZW_hi = BITCAST(
1655                 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1656                 v128bitTy);
1657         }
1658
1659         // init denormalize variables if needed
1660         Instruction::CastOps IntToFpCast;
1661         Value*               conversionFactor;
1662
1663         switch (conversionType)
1664         {
1665         case CONVERT_NORMALIZED:
1666             IntToFpCast      = Instruction::CastOps::SIToFP;
1667             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1668             break;
1669         case CONVERT_SSCALED:
1670             IntToFpCast      = Instruction::CastOps::SIToFP;
1671             conversionFactor = VIMMED1((float)(1.0));
1672             break;
1673         case CONVERT_USCALED:
1674             SWR_INVALID("Type should not be sign extended!");
1675             conversionFactor = nullptr;
1676             break;
1677         default:
1678             SWR_ASSERT(conversionType == CONVERT_NONE);
1679             conversionFactor = nullptr;
1680             break;
1681         }
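             // Note: 1.0/32767.0 is the SNORM16 scale factor, mapping a raw value of 32767 to 1.0f
             // after the signed-int-to-float cast.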
1682
1683         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1684         // simdvertex
1685         for (uint32_t i = 0; i < 4; i++)
1686         {
1687             if (isComponentEnabled(compMask, i))
1688             {
1689                 if (compCtrl[i] == ComponentControl::StoreSrc)
1690                 {
1691                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1692                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1693                     // if x or y, use vi128XY permute result, else use vi128ZW
1694                     Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1695                     Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1696
1697                     if (bFP)
1698                     {
1699                         // extract 128 bit lanes and convert the half floats to single precision
1700                         Value* temp_lo =
1701                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1702                         Value* temp_hi =
1703                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1704
1705                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1706                     }
1707                     else
1708                     {
1709                         // extract 128 bit lanes to sign extend each component
1710                         Value* temp_lo =
1711                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1712                         Value* temp_hi =
1713                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1714
1715                         Value* temp = JOIN_16(temp_lo, temp_hi);
1716
1717                         // denormalize if needed
1718                         if (conversionType != CONVERT_NONE)
1719                         {
1720                             temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1721                         }
1722
1723                         vVertexElements[currentVertexElement] = temp;
1724                     }
1725
1726                     currentVertexElement += 1;
1727                 }
1728                 else
1729                 {
1730                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1731                 }
1732
1733                 if (currentVertexElement > 3)
1734                 {
1735                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1736                     // reset to the next vVertexElement to output
1737                     currentVertexElement = 0;
1738                 }
1739             }
1740         }
1741     }
1742     // else zero extend
1743     else if ((extendType == Instruction::CastOps::ZExt) ||
1744              (extendType == Instruction::CastOps::UIToFP))
1745     {
1746         // pshufb masks for each component
1747         Value* vConstMask[2];
1748
1749         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1750         {
1751             // x/z shuffle mask
1752             vConstMask[0] = C<char>({
1753                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1754                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1755             });
1756         }
1757
1758         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1759         {
1760             // y/w shuffle mask
1761             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1762                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1763         }
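             // Note: pairing each word position with -1 fill bytes means PSHUFB both selects the
             // 16-bit component and zeroes the upper half of each 32-bit lane, so the shuffle itself
             // performs the zero extension.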
1764
1765         // init denormalize variables if needed
1766         Instruction::CastOps fpCast;
1767         Value*               conversionFactor;
1768
1769         switch (conversionType)
1770         {
1771         case CONVERT_NORMALIZED:
1772             fpCast           = Instruction::CastOps::UIToFP;
1773             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1774             break;
1775         case CONVERT_USCALED:
1776             fpCast           = Instruction::CastOps::UIToFP;
1777             conversionFactor = VIMMED1((float)(1.0f));
1778             break;
1779         case CONVERT_SSCALED:
1780             SWR_INVALID("Type should not be zero extended!");
1781             conversionFactor = nullptr;
1782             break;
1783         default:
1784             SWR_ASSERT(conversionType == CONVERT_NONE);
1785             conversionFactor = nullptr;
1786             break;
1787         }
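             // Note: 1.0/65535.0 is the UNORM16 scale factor, mapping a raw value of 65535 to 1.0f
             // after the unsigned-int-to-float cast.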
1788
1789         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1790         for (uint32_t i = 0; i < 4; i++)
1791         {
1792             if (isComponentEnabled(compMask, i))
1793             {
1794                 if (compCtrl[i] == ComponentControl::StoreSrc)
1795                 {
1796                     // select correct constMask for x/z or y/w pshufb
1797                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1798                     // if x or y, use vi128XY permute result, else use vi128ZW
1799                     uint32_t selectedGather = (i < 2) ? 0 : 1;
1800
1801                     // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
1802                     // for now..
1803
1804                     Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1805                     Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1806
1807                     Value* temp_lo = BITCAST(
1808                         PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1809                         vGatherTy);
1810                     Value* temp_hi = BITCAST(
1811                         PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1812                         vGatherTy);
1813
1814                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
1815                     // 256i - 0    1    2    3    4    5    6    7
1816                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1817
1818                     Value* temp = JOIN_16(temp_lo, temp_hi);
1819
1820                     // denormalize if needed
1821                     if (conversionType != CONVERT_NONE)
1822                     {
1823                         temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1824                     }
1825
1826                     vVertexElements[currentVertexElement] = temp;
1827
1828                     currentVertexElement += 1;
1829                 }
1830                 else
1831                 {
1832                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1833                 }
1834
1835                 if (currentVertexElement > 3)
1836                 {
1837                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1838                     // reset to the next vVertexElement to output
1839                     currentVertexElement = 0;
1840                 }
1841             }
1842         }
1843     }
1844     else
1845     {
1846         SWR_INVALID("Unsupported extend type");
1847     }
1848 }
1849
1850 void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1851 {
1852     // Unpack tuple args
1853     Value*(&vGatherResult)[2]                       = std::get<0>(args);
1854     Value*                     pVtxOut              = std::get<1>(args);
1855     const Instruction::CastOps extendType           = std::get<2>(args);
1856     const ConversionType       conversionType       = std::get<3>(args);
1857     uint32_t&                  currentVertexElement = std::get<4>(args);
1858     uint32_t&                  outputElt            = std::get<5>(args);
1859     const ComponentEnable      compMask             = std::get<6>(args);
1860     const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1861     Value*(&vVertexElements)[4]                     = std::get<8>(args);
1862
1863     // cast types
1864     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1865     Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1866
1867     // have to do extra work for sign extending
1868     if ((extendType == Instruction::CastOps::SExt) ||
1869         (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1870     {
1871         // is this a half-precision (partial precision) float?
1872         bool bFP = (extendType == Instruction::CastOps::FPExt);
1873
1874         Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1875         Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
1876                                           mVWidth / 4); // vwidth is units of 32 bits
1877
1878         // shuffle mask
1879         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1880                                      0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1881         Value* vi128XY    = nullptr;
1882         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1883         {
1884             Value* vShufResult =
1885                 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1886             // after pshufb: group components together in each 128bit lane
1887             // 256i - 0    1    2    3    4    5    6    7
1888             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1889
1890             vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1891             // after PERMD: move and pack xy components into each 128bit lane
1892             // 256i - 0    1    2    3    4    5    6    7
1893             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1894         }
1895
1896         // do the same for zw components
1897         Value* vi128ZW = nullptr;
1898         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1899         {
1900             Value* vShufResult =
1901                 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1902             vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1903         }
1904
1905         // init denormalize variables if needed
1906         Instruction::CastOps IntToFpCast;
1907         Value*               conversionFactor;
1908
1909         switch (conversionType)
1910         {
1911         case CONVERT_NORMALIZED:
1912             IntToFpCast      = Instruction::CastOps::SIToFP;
1913             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1914             break;
1915         case CONVERT_SSCALED:
1916             IntToFpCast      = Instruction::CastOps::SIToFP;
1917             conversionFactor = VIMMED1((float)(1.0));
1918             break;
1919         case CONVERT_USCALED:
1920             SWR_INVALID("Type should not be sign extended!");
1921             conversionFactor = nullptr;
1922             break;
1923         default:
1924             SWR_ASSERT(conversionType == CONVERT_NONE);
1925             conversionFactor = nullptr;
1926             break;
1927         }
1928
1929         // sign extend all enabled components. If we have a full vVertexElements, output to the current
1930         // simdvertex
1931         for (uint32_t i = 0; i < 4; i++)
1932         {
1933             if (isComponentEnabled(compMask, i))
1934             {
1935                 if (compCtrl[i] == ComponentControl::StoreSrc)
1936                 {
1937                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1938                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1939                     // if x or y, use vi128XY permute result, else use vi128ZW
1940                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
1941
1942                     if (bFP)
1943                     {
1944                         // extract 128 bit lanes and convert the half floats to single precision
1945                         vVertexElements[currentVertexElement] =
1946                             CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1947                     }
1948                     else
1949                     {
1950                         // extract 128 bit lanes to sign extend each component
1951                         vVertexElements[currentVertexElement] =
1952                             PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
1953
1954                         // denormalize if needed
1955                         if (conversionType != CONVERT_NONE)
1956                         {
1957                             vVertexElements[currentVertexElement] =
1958                                 FMUL(CAST(IntToFpCast,
1959                                           vVertexElements[currentVertexElement],
1960                                           mSimdFP32Ty),
1961                                      conversionFactor);
1962                         }
1963                     }
1964                     currentVertexElement++;
1965                 }
1966                 else
1967                 {
1968                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1969                 }
1970
1971                 if (currentVertexElement > 3)
1972                 {
1973                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1974                     // reset to the next vVertexElement to output
1975                     currentVertexElement = 0;
1976                 }
1977             }
1978         }
1979     }
1980     // else zero extend
1981     else if ((extendType == Instruction::CastOps::ZExt) ||
1982              (extendType == Instruction::CastOps::UIToFP))
1983     {
1984         // pshufb masks for each component
1985         Value* vConstMask[2];
1986         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1987         {
1988             // x/z shuffle mask
1989             vConstMask[0] = C<char>({
1990                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1991                 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1992             });
1993         }
1994
1995         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1996         {
1997             // y/w shuffle mask
1998             vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1999                                      2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2000         }
2001
2002         // init denormalize variables if needed
2003         Instruction::CastOps fpCast;
2004         Value*               conversionFactor;
2005
2006         switch (conversionType)
2007         {
2008         case CONVERT_NORMALIZED:
2009             fpCast           = Instruction::CastOps::UIToFP;
2010             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2011             break;
2012         case CONVERT_USCALED:
2013             fpCast           = Instruction::CastOps::UIToFP;
2014             conversionFactor = VIMMED1((float)(1.0f));
2015             break;
2016         case CONVERT_SSCALED:
2017             SWR_INVALID("Type should not be zero extended!");
2018             conversionFactor = nullptr;
2019             break;
2020         default:
2021             SWR_ASSERT(conversionType == CONVERT_NONE);
2022             conversionFactor = nullptr;
2023             break;
2024         }
2025
2026         // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2027         for (uint32_t i = 0; i < 4; i++)
2028         {
2029             if (isComponentEnabled(compMask, i))
2030             {
2031                 if (compCtrl[i] == ComponentControl::StoreSrc)
2032                 {
2033                     // select correct constMask for x/z or y/w pshufb
2034                     uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2035                     // if x or y, use vi128XY permute result, else use vi128ZW
2036                     uint32_t selectedGather = (i < 2) ? 0 : 1;
2037
2038                     vVertexElements[currentVertexElement] =
2039                         BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2040                                        vConstMask[selectedMask]),
2041                                 vGatherTy);
2042                     // after pshufb mask for x channel; z uses the same shuffle from the second gather
2043                     // 256i - 0    1    2    3    4    5    6    7
2044                     //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2045
2046                     // denormalize if needed
2047                     if (conversionType != CONVERT_NONE)
2048                     {
2049                         vVertexElements[currentVertexElement] =
2050                             FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2051                                  conversionFactor);
2052                     }
2053                     currentVertexElement++;
2054                 }
2055                 else
2056                 {
2057                     vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2058                 }
2059
2060                 if (currentVertexElement > 3)
2061                 {
2062                     StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2063                     // reset to the next vVertexElement to output
2064                     currentVertexElement = 0;
2065                 }
2066             }
2067         }
2068     }
2069     else
2070     {
2071         SWR_INVALID("Unsupported extend type");
2072     }
2073 }
2074
2075 //////////////////////////////////////////////////////////////////////////
2076 /// @brief Output a simdvertex worth of elements to the current outputElt
2077 /// @param pVtxOut - base address of VIN output struct
2078 /// @param outputElt - simdvertex offset in VIN to write to
2079 /// @param numEltsToStore - number of simdvertex rows to write out
2080 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
2081 void FetchJit::StoreVertexElements(Value*         pVtxOut,
2082                                    const uint32_t outputElt,
2083                                    const uint32_t numEltsToStore,
2084                                    Value* (&vVertexElements)[4])
2085 {
2086     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2087
2088     for (uint32_t c = 0; c < numEltsToStore; ++c)
2089     {
2090         // STORE expects FP32 x vWidth type, just bitcast if needed
2091         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2092         {
2093 #if FETCH_DUMP_VERTEX
2094             PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2095 #endif
2096             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2097         }
2098 #if FETCH_DUMP_VERTEX
2099         else
2100         {
2101             PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2102         }
2103 #endif
2104         // outputElt * 4 = offsetting by the size of a simdvertex
2105         // + c offsets to a 32bit x vWidth row within the current vertex
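             // e.g. outputElt = 2, c = 1 gives GEP index 9, the tenth 32bit x vWidth row of the output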
2106         Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2107         STORE(vVertexElements[c], dest);
2108     }
2109 }
2110
2111 //////////////////////////////////////////////////////////////////////////
2112 /// @brief Generates a constant vector of values based on the
2113 /// ComponentControl value
2114 /// @param ctrl - ComponentControl value
2115 Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2116 {
2117     switch (ctrl)
2118     {
2119     case NoStore:
2120         return VUNDEF_I();
2121     case Store0:
2122         return VIMMED1(0);
2123     case Store1Fp:
2124         return VIMMED1(1.0f);
2125     case Store1Int:
2126         return VIMMED1(1);
2127     case StoreVertexId:
2128     {
2129         if (mVWidth == 16)
2130         {
2131             Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
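                  // Note: for SIMD16 the fetch context supplies the vertex IDs as two 8-wide vectors
                  // (VertexID and VertexID2), which are joined into a single 16-wide vector here.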
2132             Value* pIdLo =
2133                 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2134             Value* pIdHi =
2135                 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2136             return JOIN_16(pIdLo, pIdHi);
2137         }
2138         else
2139         {
2140             return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2141         }
2142     }
2143     case StoreInstanceId:
2144     {
2145         Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2146         return VBROADCAST(pId);
2147     }
2148
2149
2150     case StoreSrc:
2151     default:
2152         SWR_INVALID("Invalid component control");
2153         return VUNDEF_I();
2154     }
2155 }
2156
2157 //////////////////////////////////////////////////////////////////////////
2158 /// @brief Returns the enable mask for the specified component.
2159 /// @param enableMask - enable bits
2160 /// @param component - component to check if enabled.
2161 bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2162 {
2163     switch (component)
2164     {
2165         // X
2166     case 0:
2167         return (enableMask & ComponentEnable::X);
2168         // Y
2169     case 1:
2170         return (enableMask & ComponentEnable::Y);
2171         // Z
2172     case 2:
2173         return (enableMask & ComponentEnable::Z);
2174         // W
2175     case 3:
2176         return (enableMask & ComponentEnable::W);
2177
2178     default:
2179         return false;
2180     }
2181 }
2182
2183 // Don't want two threads compiling the same fetch shader simultaneously
2184 // Has problems in the JIT cache implementation
2185 // This is only a problem for fetch right now.
2186 static std::mutex gFetchCodegenMutex;
2187
2188 //////////////////////////////////////////////////////////////////////////
2189 /// @brief JITs from fetch shader IR
2190 /// @param hJitMgr - JitManager handle
2191 /// @param func   - LLVM function IR
2192 /// @return PFN_FETCH_FUNC - pointer to fetch code
2193 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2194 {
2195     const llvm::Function* func    = (const llvm::Function*)hFunc;
2196     JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2197     PFN_FETCH_FUNC        pfnFetch;
2198
2199     gFetchCodegenMutex.lock();
2200     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2201     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2202     // add new IR to the module
2203     pJitMgr->mIsModuleFinalized = true;
2204
2205 #if defined(KNOB_SWRC_TRACING)
2206     char        fName[1024];
2207     const char* funcName = func->getName().data();
2208     sprintf(fName, "%s.bin", funcName);
2209     FILE* fd = fopen(fName, "wb");
2210     fwrite((void*)pfnFetch, 1, 2048, fd);
2211     fclose(fd);
2212 #endif
2213
2214     pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2215     gFetchCodegenMutex.unlock();
2216
2217
2218     return pfnFetch;
2219 }
2220
2221 //////////////////////////////////////////////////////////////////////////
2222 /// @brief JIT compiles fetch shader
2223 /// @param hJitMgr - JitManager handle
2224 /// @param state   - fetch state to build function from
2225 extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2226 {
2227     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2228
2229     pJitMgr->SetupNewModule();
2230
2231     FetchJit theJit(pJitMgr);
2232     HANDLE   hFunc = theJit.Create(state);
2233
2234     return JitFetchFunc(hJitMgr, hFunc);
2235 }