/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file fetch_jit.cpp
 *
 * @brief Implementation of the fetch jitter
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "jit_api.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1

using namespace llvm;
using namespace SwrJit;

bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);

// conversion applied to a gathered component before it is stored
enum ConversionType
{
    CONVERT_NONE,
    CONVERT_NORMALIZED,
    CONVERT_USCALED,
    CONVERT_SSCALED,
    CONVERT_SFIXED,
};
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}

    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);

    template <typename T>
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&,
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4],
                       const uint32_t (&)[4]>
        Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value* (&)[2],
                       Value*,
                       const Instruction::CastOps,
                       const ConversionType,
                       uint32_t&,
                       uint32_t&,
                       const ComponentEnable,
                       const ComponentControl (&)[4],
                       Value* (&)[4]>
        Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    void StoreVertexElements(Value* pVtxOut,
                             const uint32_t outputElt,
                             const uint32_t numEltsToStore,
                             Value* (&vVertexElements)[4]);

    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                           Value*                     streams,
                           Value*                     vIndices,
                           Value*                     pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpWorkerData;
    Value* mpFetchInfo;
};
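// Note on the Shuffle*Args tuples above: they are built with std::forward_as_tuple, so the
// reference elements (currentVertexElement, outputElt, vVertexElements, ...) alias the
// caller's locals and the Shuffle* helpers mutate them in place. A minimal sketch of the
// pattern (illustrative only, not part of this file):
//
//   uint32_t cur = 0, outElt = 0;
//   Value*   verts[4];
//   Shuffle8bpcArgs args = std::forward_as_tuple(gather, pVtxOut, castOp, convType,
//                                                cur, outElt, mask, ctrl, verts, swizzle);
//   std::get<4>(args) += 1; // increments 'cur' itself, not a copy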
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments
    Value* privateContext = &*argitr;
    ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpWorkerData = &*argitr;
    ++argitr;
    mpWorkerData->setName("pWorkerData");

    mpFetchInfo = &*argitr;
    ++argitr;
    mpFetchInfo->setName("fetchInfo");

    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");
    uint32_t baseWidth = mVWidth;

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");

    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck)
            ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        SWR_INVALID("Unsupported index type");
        vIndices = nullptr;
        break;
    }
    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vIndices = ADD(vIndices, pOffsets);
    }
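    // e.g. (illustration) with StartVertex = 100 and an 8-wide simd, the lanes now hold
    // indices {100, 101, 102, 103, 104, 105, 106, 107}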
    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
        // correct
        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vVertexId           = ADD(vIndices, vBaseVertex);
        vVertexId           = ADD(vVertexId, vStartVertex);
    }
    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
        }
    }
    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createInstructionSimplifierPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);

    optPasses.add(createLowerX86Pass(this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");

    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}
// returns true for odd formats that require special state.gather handling
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}
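// e.g. (illustration) B5G6R5_UNORM has bpc = {5, 6, 5} and so takes the odd-format path
// below, while R8G8B8A8_UNORM has bpc[0] == 8 and uses the standard gather path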
// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
    uint32_t               bpc0  = info.bpc[0];
    uint32_t               type0 = info.type[0];

    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }

    return true;
}
// unpacks components based on format
// foreach component in the pixel:
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits      = info.bpc[c];
        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
        Value*   comp          = AND(vInput, bitmask);
        comp                   = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}
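// e.g. (illustration) for a packed 5-6-5 pixel the loop above produces:
//   c0: bitmask 0x001F, shift  0
//   c1: bitmask 0x07E0, shift  5
//   c2: bitmask 0xF800, shift 11
// leaving each component right-justified in its own 32-bit simd register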
// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(
    SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* pGather;
    if (info.bpp == 32)
    {
        pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
    }
    else
    {
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
        Value* pMem = ALLOCA(mSimdInt32Ty);
        STORE(VIMMED1(0u), pMem);

        pBase          = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
        Value* pDstMem = BITCAST(pMem, mInt32PtrTy);

        for (uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index and mask for this lane
            Value* index = VEXTRACT(pOffsets, C(lane));
            Value* mask  = VEXTRACT(pMask, C(lane));

            switch (info.bpp)
            {
            case 8:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }

            case 16:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }

            case 24:
            {
                // First 16-bits of data
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);

                // Last 8-bits of data
                pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
                pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
                STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
                break;
            }

            default:
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                break;
            }
        }

        pGather = LOAD(pMem);
    }

    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
    }

    UnpackComponents(format, pGather, pResult);

    // bitcast the unpacked components to float lanes for output
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
}
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps below -1.0f. e.g. the 5-bit value 10000 maps
                /// to -16, but should clamp to -1.0f.

                /// result = c * (1.0f / (2^(n-1) - 1));
                uint32_t n      = info.bpc[c];
                uint32_t pow2   = 1 << (n - 1);
                float    scale  = 1.0f / (float)(pow2 - 1);
                Value*   vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n    = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP
                // requirement
                if (n == 24)
                {
                    float  scale  = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float  scale  = 1.0f / (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
        }
    }
}
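// e.g. (illustration) for an 8-bit UNORM component: scale = 1/255, so raw 255 -> 1.0f and
// raw 128 -> ~0.502f; for an 8-bit SNORM component: scale = 1/127, so raw 127 -> 1.0f and
// raw -127 -> -1.0f (raw -128 lands slightly below -1.0f, per the @todo above)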
//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                                 Value*                     streams,
                                 Value*                     vIndices,
                                 Value*                     pVtxOut)
{
    uint32_t currentVertexElement = 0;
    uint32_t outputElt            = 0;
    Value*   vVertexElements[4];

    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");
    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc =
            info.bpp /
            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.

        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});

        // VGATHER* takes an *i8 src pointer
        Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));

        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        Value* vStride = VBROADCAST(stride);

        // max vertex index that is fully in bounds
        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
        maxVertex        = LOAD(maxVertex);

        Value* minVertex = NULL;
        if (fetchState.bPartialVertexBuffer)
        {
            // min vertex index for low bounds OOB checking
            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
            minVertex = LOAD(minVertex);
        }

        if (fetchState.bInstanceIDOffsetEnable)
        {
            // the InstanceID (curInstance) value is offset by StartInstanceLocation
            curInstance = ADD(curInstance, startInstance);
        }
        Value* vCurIndices;
        Value* startOffset;
        Value* vInstanceStride = VIMMED1(0);

        if (ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceAdvancementState);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);
            startOffset = startInstance;
        }
        else if (ied.InstanceStrideEnable)
        {
            // grab the instance advancement state, determines stride in bytes from one instance to
            // the next
            Value* stepRate = C(ied.InstanceAdvancementState);
            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));

            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);
            startOffset = startVertex;
        }
        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
        // do 64bit address offset calculations.

        // calculate byte offset to the start of the VB
        Value* baseOffset     = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
        pStreamBase           = GEP(pStreamBase, baseOffset);
        Value* pStreamBaseGFX = ADD(stream, baseOffset);

        // if we have a start offset, subtract from max vertex. Used for OOB check
        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
        // if we have a negative value, we're already OOB. clamp at 0.
        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));

        if (fetchState.bPartialVertexBuffer)
        {
            // similarly for min vertex
            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
        }

        // Load the in bounds size of a partially valid vertex
        Value* partialInboundsSize =
            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
        partialInboundsSize       = LOAD(partialInboundsSize);
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
        Value* vBpp               = VBROADCAST(C(info.bpp));
        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));

        // is the element <= the partially valid size?
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));

        // override cur indices with 0 if pitch is 0
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
        // are vertices partially OOB?
        Value* vMaxVertex      = VBROADCAST(maxVertex);
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);

        // are vertices fully in bounds?
        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);

        Value* vGatherMask;
        if (fetchState.bPartialVertexBuffer)
        {
            // are vertices below minVertex limit?
            Value* vMinVertex     = VBROADCAST(minVertex);
            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);

            // only fetch lanes that pass both tests
            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
        }
        else
        {
            vGatherMask = vMaxGatherMask;
        }

        // blend in any partially OOB indices that have valid elements
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);

        // calculate the actual offsets into the VB
        Value* vOffsets = MUL(vCurIndices, vStride);
        vOffsets        = ADD(vOffsets, vAlignmentOffsets);

        // if instance stride enable is:
        //  true  - add product of the instanceID and advancement state to the offset into the VB
        //  false - value of vInstanceStride has been initialized to zero
        vOffsets = ADD(vOffsets, vInstanceStride);

        // Packing and component control
        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
                                           (ComponentControl)ied.ComponentControl1,
                                           (ComponentControl)ied.ComponentControl2,
                                           (ComponentControl)ied.ComponentControl3};
        // Special gather/conversion for formats without equal component sizes
        if (IsOddFormat((SWR_FORMAT)ied.Format))
        {
            Value* pResults[4];
            CreateGatherOddFormats(
                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);

            for (uint32_t c = 0; c < 4; c += 1)
            {
                if (isComponentEnabled(compMask, c))
                {
                    vVertexElements[currentVertexElement++] = pResults[c];
                    if (currentVertexElement > 3)
                    {
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                        // reset to the next vVertexElement to output
                        currentVertexElement = 0;
                    }
                }
            }
        }
        else if (info.type[0] == SWR_TYPE_FLOAT)
        {
            ///@todo: support 64 bit vb accesses
            Value* gatherSrc = VIMMED1(0.0f);

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));

                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  Instruction::CastOps::FPExt,
                                                                  CONVERT_NONE,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            // Gather a SIMD of vertices
                            // APIs allow a 4GB range for offsets
                            // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
                            // But, we know that elements must be aligned for FETCH. :)
                            // Right shift the offset by a bit and then scale by 2 to remove the
                            // sign extension.
                            Value* vShiftedOffsets = LSHR(vOffsets, 1);
                            vVertexElements[currentVertexElement++] =
                                GATHERPS(gatherSrc,
                                         pStreamBaseGFX,
                                         vShiftedOffsets,
                                         vGatherMask,
                                         2,
                                         GFX_MEM_CLIENT_FETCH);
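                            // e.g. (illustration) a byte offset of 0x80000004 would sign-extend
                            // negative as an i32 gather offset; LSHR by 1 gives 0x40000002 and
                            // the gather scale of 2 rebuilds the original address, so the full
                            // 4GB range works for aligned elements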
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase    = GEP(pStreamBase, C((char)4));
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            case 64:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* vShufLo;
                            Value* vShufHi;
                            Value* vShufAll;

                            if (mVWidth == 8)
                            {
                                vShufLo  = C({0, 1, 2, 3});
                                vShufHi  = C({4, 5, 6, 7});
                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
                            }
                            else
                            {
                                SWR_ASSERT(mVWidth == 16);
                                vShufLo  = C({0, 1, 2, 3, 4, 5, 6, 7});
                                vShufHi  = C({8, 9, 10, 11, 12, 13, 14, 15});
                                vShufAll =
                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
                            }

                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);

                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);

                            Value* vZeroDouble = VECTOR_SPLAT(
                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                            Value* pGatherLo =
                                GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
                            Value* pGatherHi =
                                GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);

                            pGatherLo = VCVTPD2PS(pGatherLo);
                            pGatherHi = VCVTPD2PS(pGatherHi);
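                            // each half gather returned mVWidth/2 doubles; after CVTPD2PS each
                            // half holds mVWidth/2 floats, and the shuffle below concatenates
                            // the two halves back into a full simd of floats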
                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                            vVertexElements[currentVertexElement++] = pGather;
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)8));
                }
            }
            break;
            default:
                SWR_INVALID("Tried to fetch invalid FP format");
                break;
            }
        }
        else
        {
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
            ConversionType       conversionType = CONVERT_NONE;

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            switch (info.type[0])
            {
            case SWR_TYPE_UNORM:
                conversionType = CONVERT_NORMALIZED;
                // fall through: UNORM and UINT both zero extend
            case SWR_TYPE_UINT:
                extendCastType = Instruction::CastOps::ZExt;
                break;
            case SWR_TYPE_SNORM:
                conversionType = CONVERT_NORMALIZED;
                // fall through: SNORM and SINT both sign extend
            case SWR_TYPE_SINT:
                extendCastType = Instruction::CastOps::SExt;
                break;
            case SWR_TYPE_USCALED:
                conversionType = CONVERT_USCALED;
                extendCastType = Instruction::CastOps::UIToFP;
                break;
            case SWR_TYPE_SSCALED:
                conversionType = CONVERT_SSCALED;
                extendCastType = Instruction::CastOps::SIToFP;
                break;
            case SWR_TYPE_SFIXED:
                conversionType = CONVERT_SFIXED;
                extendCastType = Instruction::CastOps::SExt;
                break;
            default:
                SWR_INVALID("Unsupported type %d.", info.type[0]);
                break;
            }
            // value substituted when component of gather is masked
            Value* gatherSrc = VIMMED1(0);

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 8:
            {
                // if we have at least one component to fetch
                if (compMask)
                {
                    Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of an 8x32bit integer gather for 8bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                 pVtxOut,
                                                                 extendCastType,
                                                                 conversionType,
                                                                 currentVertexElement,
                                                                 outputElt,
                                                                 compMask,
                                                                 compCtrl,
                                                                 vVertexElements,
                                                                 info.swizzle);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));

                    vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  extendCastType,
                                                                  conversionType,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                // Gather components into place in simdvertex struct
                for (uint32_t i = 0; i < 4; i++)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* pGather =
                                GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);

                            if (conversionType == CONVERT_USCALED)
                            {
                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SSCALED)
                            {
                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SFIXED)
                            {
                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
                                               VBROADCAST(C(1 / 65536.0f)));
                            }
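                            // e.g. (illustration) for 16.16 fixed point, raw 0x00018000 = 98304
                            // and 98304 * (1/65536.0f) = 1.5f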
                            vVertexElements[currentVertexElement++] = pGather;

                            // e.g. result of a single 8x32bit integer gather for 32bit components
                            // 256i - 0    1    2    3    4    5    6    7
                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBase = GEP(pStreamBase, C((char)4));
                }
            }
            break;
            }
        }
    }

    // if we have a partially filled vVertexElement struct, output it
    if (currentVertexElement > 0)
    {
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
    }
}
template <typename T>
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
{
    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
               "Function expects gfxptr_t for both input parameters.");

    Type* Ty = nullptr;

    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
    if (bSize)
    {
        Ty = mInt16PtrTy;
    }
    else if (sizeof(T) == sizeof(uint8_t))
    {
        Ty = mInt8PtrTy;
    }
    else
    {
        SWR_ASSERT(false, "This should never happen as per static_assert above.");
    }

    Value* vIndices = VUNDEF_I();

    {
        // store 0 index on stack to be used to conditionally load from if index address is OOB
        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
        STORE(C((T)0), pZeroIndex);

        // Load a SIMD of index pointers
        for (int64_t lane = 0; lane < mVWidth; lane++)
        {
            // Calculate the address of the requested index
            Value* pIndex = GEP(pIndices, C(lane), Ty);

            pLastIndex = INT_TO_PTR(pLastIndex, Ty);

            // check if the address is less than the max index, i.e. it holds a valid index
            Value* mask = ICMP_ULT(pIndex, pLastIndex);

            // if valid, load the index. if not, load 0 from the stack
            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
            Value* index  = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);

            // zero extend index to 32 bits and insert into the correct simd lane
            index    = Z_EXT(index, mInt32Ty);
            vIndices = VINSERT(vIndices, index, lane);
        }
    }

    return vIndices;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 8 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 16 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
{
    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// @param pIndices - pointer to 32 bit indices
/// @param pLastIndex - pointer to last valid index
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
{
    DataLayout dL(JM()->mpCurrentModule);
    Value*     iLastIndex = pLastIndex;
    Value*     iIndices   = pIndices;

    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
    numIndicesLeft        = SDIV(numIndicesLeft, C(4));

    // create a vector of index counts from the base index ptr passed into the fetch
    Constant* vIndexOffsets;
    if (mVWidth == 8)
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
    }
    else
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
    }

    // compare index count to the max valid index
    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
    //     vIndexOffsets  0 1 2 3 4 5 6 7
    //     ------------------------------
    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);

    // Load the indices; OOB loads 0
    return MASKED_LOAD(pIndices,
                       4,
                       vIndexMask,
                       VIMMED1(0),
                       "vIndices",
                       PointerType::get(mSimdInt32Ty, 0),
                       GFX_MEM_CLIENT_FETCH);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - 8 gathered 8bpc vertices
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
///   @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult                      = std::get<0>(args);
    Value*  pVtxOut                            = std::get<1>(args);
    const Instruction::CastOps extendType      = std::get<2>(args);
    const ConversionType       conversionType  = std::get<3>(args);
    uint32_t&              currentVertexElement = std::get<4>(args);
    uint32_t&              outputElt            = std::get<5>(args);
    const ComponentEnable  compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]        = std::get<7>(args);
    Value*(&vVertexElements)[4]                 = std::get<8>(args);
    const uint32_t(&swizzle)[4]                 = std::get<9>(args);
    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 16x8bit ints in a 128bit lane
        Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask, including any swizzling
        const char x          = (char)swizzle[0];
        const char y          = (char)swizzle[1];
        const char z          = (char)swizzle[2];
        const char w          = (char)swizzle[3];
        Value*     vConstMask = C<char>(
            {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
             char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
             char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
             char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
             char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
             char(w + 8), char(w + 12)});
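        // e.g. with the identity swizzle (x,y,z,w) = (0,1,2,3) the mask above becomes
        //   {0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15, ...repeated for the high lane}
        // which pulls every 4th byte together, grouping all x bytes, then y, z, w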
        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

        Value* vShufResult_lo =
            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value* vShufResult_hi =
            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    // sign extend
                    Value* temp_lo =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value* temp_hi =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value* vConstMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask =
                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask =
                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask =
                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask =
                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        vConstMask = nullptr;
                        break;
                    }

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

                    Value* temp_lo =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value* temp_hi =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult                      = std::get<0>(args);
    Value*  pVtxOut                            = std::get<1>(args);
    const Instruction::CastOps extendType      = std::get<2>(args);
    const ConversionType       conversionType  = std::get<3>(args);
    uint32_t&              currentVertexElement = std::get<4>(args);
    uint32_t&              outputElt            = std::get<5>(args);
    const ComponentEnable  compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]        = std::get<7>(args);
    Value*(&vVertexElements)[4]                 = std::get<8>(args);
    const uint32_t(&swizzle)[4]                 = std::get<9>(args);

    // cast types
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    for (uint32_t i = 0; i < 4; i++)
    {
        if (!isComponentEnabled(compMask, i))
            continue;

        if (compCtrl[i] == ComponentControl::StoreSrc)
        {
            std::vector<uint32_t> vShuffleMasks[4] = {
                {0, 4, 8, 12, 16, 20, 24, 28},  // x
                {1, 5, 9, 13, 17, 21, 25, 29},  // y
                {2, 6, 10, 14, 18, 22, 26, 30}, // z
                {3, 7, 11, 15, 19, 23, 27, 31}, // w
            };

            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
                                  UndefValue::get(v32x8Ty),
                                  vShuffleMasks[swizzle[i]]);
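            // e.g. for x (swizzle 0), the mask {0,4,8,...,28} picks byte 0 of each gathered
            // 32-bit lane, producing an <mVWidth x i8> vector of x components that the
            // extend/convert below widens to 32 bits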
            if ((extendType == Instruction::CastOps::SExt) ||
                (extendType == Instruction::CastOps::SIToFP))
            {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
                    break;
                case CONVERT_SSCALED:
                    val = SI_TO_FP(val, mSimdFP32Ty);
                    break;
                case CONVERT_USCALED:
                    SWR_INVALID("Type should not be sign extended!");
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = S_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else if ((extendType == Instruction::CastOps::ZExt) ||
                     (extendType == Instruction::CastOps::UIToFP))
            {
                switch (conversionType)
                {
                case CONVERT_NORMALIZED:
                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
                    break;
                case CONVERT_SSCALED:
                    SWR_INVALID("Type should not be zero extended!");
                    break;
                case CONVERT_USCALED:
                    val = UI_TO_FP(val, mSimdFP32Ty);
                    break;
                default:
                    SWR_ASSERT(conversionType == CONVERT_NONE);
                    val = Z_EXT(val, mSimdInt32Ty);
                    break;
                }
            }
            else
            {
                SWR_INVALID("Unsupported conversion type");
            }

            vVertexElements[currentVertexElement++] = val;
        }
        else
        {
            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
        }

        if (currentVertexElement > 3)
        {
            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
            // reset to the next vVertexElement to output
            currentVertexElement = 0;
        }
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param bNormalized - do we need to denormalize?
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2]                  = std::get<0>(args);
    Value* pVtxOut                             = std::get<1>(args);
    const Instruction::CastOps extendType      = std::get<2>(args);
    const ConversionType       conversionType  = std::get<3>(args);
    uint32_t&              currentVertexElement = std::get<4>(args);
    uint32_t&              outputElt            = std::get<5>(args);
    const ComponentEnable  compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]        = std::get<7>(args);
    Value*(&vVertexElements)[4]                 = std::get<8>(args);
    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this a half-precision (FPExt) float?
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask
        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
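        // e.g. each 128-bit source lane holds x0y0 x1y1 x2y2 x3y3 as 16-bit words; bytes
        // {0,1, 4,5, 8,9, 12,13} collect the x words and {2,3, 6,7, 10,11, 14,15} the y
        // words, giving xxxx yyyy per lane after the pshufb below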
        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
            // now..

            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);

            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
        }

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a full vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    if (bFP)
                    {
                        // extract 128 bit lanes and convert from half to full float
                        Value* temp_lo =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        Value* temp_lo =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        Value* temp = JOIN_16(temp_lo, temp_hi);

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
                        }

                        vVertexElements[currentVertexElement] = temp;
                    }

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else the second (zw) gather
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of
                    // KNL, for now..

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2]                  = std::get<0>(args);
    Value* pVtxOut                             = std::get<1>(args);
    const Instruction::CastOps extendType      = std::get<2>(args);
    const ConversionType       conversionType  = std::get<3>(args);
    uint32_t&              currentVertexElement = std::get<4>(args);
    uint32_t&              outputElt            = std::get<5>(args);
    const ComponentEnable  compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]        = std::get<7>(args);
    Value*(&vVertexElements)[4]                 = std::get<8>(args);
1864 Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1865 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1867 // have to do extra work for sign extending
1868 if ((extendType == Instruction::CastOps::SExt) ||
1869 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1871 // is this PP float?
1872 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1874 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
1875 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
1876 mVWidth / 4); // vwidth is units of 32 bits
1879 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1880 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1881 Value* vi128XY = nullptr;
1882 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1884 Value* vShufResult =
1885 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1886 // after pshufb: group components together in each 128bit lane
1887 // 256i - 0 1 2 3 4 5 6 7
1888 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1890 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1891 // after PERMD: move and pack xy components into each 128bit lane
1892 // 256i - 0 1 2 3 4 5 6 7
1893 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1896 // do the same for zw components
1897 Value* vi128ZW = nullptr;
1898 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1900 Value* vShufResult =
1901 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1902 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
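
        // CONVERT_NORMALIZED implements the usual SNORM16 mapping: e.g. a raw
        // value of 16384 becomes 16384 * (1/32767) ~= 0.5 after the SIToFP
        // cast and FMUL below. SSCALED only casts to float (factor 1.0), and
        // USCALED is rejected here because unsigned data takes the
        // zero-extend path instead.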
        // Sign extend all enabled components. If we have a full vVertexElements,
        // output to the current simdvertex.
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes to convert each half-float component
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }

                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
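    // A group of four buffered rows (currentVertexElement > 3) forms one
    // complete output element, which is why the loops above and below flush
    // through StoreVertexElements in batches of 4.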
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }
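
        // In a PSHUFB control vector, a byte with its high bit set (-1 here)
        // writes zero to that destination byte, so these masks both select the
        // 16-bit component and zero-extend it to 32 bits in one instruction.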
        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }
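
        // Mirror image of the signed path: UNORM16 maps the full unsigned
        // range onto [0, 1] (65535 * (1/65535) == 1.0), and SSCALED is
        // rejected because signed data must go through the sign-extend path.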
        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else the second (z/w)
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }

                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
void FetchJit::StoreVertexElements(Value*         pVtxOut,
                                   const uint32_t outputElt,
                                   const uint32_t numEltsToStore,
                                   Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
        STORE(vVertexElements[c], dest);
    }
}
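
// Layout sketch of the simdvertex output written above (illustrative only;
// the real struct lives in the driver headers, not this file). With
// mVWidth == 8, each element is 4 rows of 8 floats, and dest indexes row
// outputElt * 4 + c:
//
//   float vin[numElts][4 /*components*/][8 /*SIMD lanes*/];
//   // vin[outputElt][c][lane] <- vVertexElements[c], one lane per vertex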
//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
        if (mVWidth == 16)
        {
            Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
            Value* pIdLo =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
            Value* pIdHi =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
            return JOIN_16(pIdLo, pIdHi);
        }
        else
        {
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
        }
    }
    case StoreInstanceId:
    {
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
        return VBROADCAST(pId);
    }
    case StoreSrc:
    default:
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}
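
// Typical use: when a vertex layout supplies fewer components than the shader
// reads, the remaining components are filled from these constants (e.g. a
// position with no stored w can default w to 1.0f via Store1Fp).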
//////////////////////////////////////////////////////////////////////////
/// @brief Returns the enable mask for the specified component.
/// @param enableMask - enable bits
/// @param component - component to check if enabled.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0:
        return (enableMask & ComponentEnable::X);
    // Y
    case 1:
        return (enableMask & ComponentEnable::Y);
    // Z
    case 2:
        return (enableMask & ComponentEnable::Z);
    // W
    case 3:
        return (enableMask & ComponentEnable::W);

    default:
        return false;
    }
}
// Don't want two threads compiling the same fetch shader simultaneously:
// the JIT cache implementation has problems with concurrent compiles.
// This is only a problem for fetch right now.
static std::mutex gFetchCodegenMutex;
//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
/// @param hJitMgr - JitManager handle
/// @param func - LLVM function IR
/// @return PFN_FETCH_FUNC - pointer to fetch code
PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
    const llvm::Function* func    = (const llvm::Function*)hFunc;
    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
    PFN_FETCH_FUNC        pfnFetch;

    gFetchCodegenMutex.lock();
    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
    // MCJIT finalizes modules the first time you JIT code from them. Once
    // finalized, you cannot add new IR to the module.
    pJitMgr->mIsModuleFinalized = true;

#if defined(KNOB_SWRC_TRACING)
    char        fName[1024];
    const char* funcName = func->getName().data();
    sprintf(fName, "%s.bin", funcName);
    FILE* fd = fopen(fName, "wb");
    fwrite((void*)pfnFetch, 1, 2048, fd);
    fclose(fd);
#endif

    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
    gFetchCodegenMutex.unlock();

    return pfnFetch;
}
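
// Because MCJIT finalization freezes the module, JitCompileFetch below calls
// SetupNewModule() before each compile so every fetch shader gets a fresh,
// unfinalized module to emit IR into.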
//////////////////////////////////////////////////////////////////////////
/// @brief JIT compiles fetch shader
/// @param hJitMgr - JitManager handle
/// @param state - fetch state to build function from
extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
{
    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);

    pJitMgr->SetupNewModule();

    FetchJit theJit(pJitMgr);
    HANDLE   hFunc = theJit.Create(state);

    return JitFetchFunc(hJitMgr, hFunc);
}
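
// Hypothetical caller-side sketch (the FETCH_COMPILE_STATE field setup is an
// assumption for illustration, not taken from this file):
//
//   FETCH_COMPILE_STATE state = {};
//   // ...describe the vertex layout: per-element format, component
//   // enables/controls, index buffer type, instancing step rates...
//   PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
//   // pfnFetch is then invoked per draw with a populated SWR_FETCH_CONTEXT
//   // and a pointer to the simdvertex output buffer.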