class AMDGPUSubtarget;
class AMDGPUTargetMachine;
class FunctionPass;
+struct MachineSchedContext;
class MCAsmInfo;
class raw_ostream;
+class ScheduleDAGInstrs;
class Target;
class TargetMachine;
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
-FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
-FunctionPass *createSIInsertWaits(TargetMachine &tm);
+FunctionPass *createSIInsertWaitsPass();
+
+ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
void initializeSILoadStoreOptimizerPass(PassRegistry &);
extern char &SILoadStoreOptimizerID;
+void initializeSILowerControlFlowPass(PassRegistry &);
+extern char &SILowerControlFlowPassID;
+
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
+FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaID;
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
+void initializeSIAnnotateControlFlowPass(PassRegistry&);
+extern char &SIAnnotateControlFlowPassID;
+
+void initializeSIInsertWaitsPass(PassRegistry&);
+extern char &SIInsertWaitsID;
+
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;
"true",
"Enable single precision denormal handling">;
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
- "Is64bit",
- "true",
- "Specify if 64-bit addressing should be used">;
-
def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
"R600ALUInst",
"false",
"true",
"Force using DS instruction immediate offsets on SI">;
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+ "FlatForGlobal",
+ "true",
+  "Force to generate flat instructions for global accesses">;
+
def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"FlatAddressSpace",
"true",
"Support flat address space">;
+def FeatureXNACK : SubtargetFeature<"xnack",
+ "EnableXNACK",
+ "true",
+ "Enable XNACK support">;
+
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",
"true",
"true",
"Enable scratch buffer sizes greater than 128 GB">;
+def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
+ "EnableSIScheduler",
+ "true",
+ "Enable SI Machine Scheduler">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1>;
def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0>;
def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>;
+def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>;
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
"true",
"Additional intstructions for CI+">;
+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+ "max-private-element-size-"#size,
+ "MaxPrivateElementSize",
+ !cast<string>(size),
+ "Maximum private access size may be "#size
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768,
+ [FeatureFP64, FeatureLocalMemorySize32768,
FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding,
FeatureLDSBankCount32]>;
def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ [FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
FeatureGCN1Encoding, FeatureCIInsts]>;
def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
- [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
+ [FeatureFP64, FeatureLocalMemorySize65536,
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
- FeatureGCN3Encoding, FeatureCIInsts, FeatureLDSBankCount32]>;
+ FeatureGCN3Encoding, FeatureCIInsts]>;
//===----------------------------------------------------------------------===//
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS"
>, AssemblerPredicate<"FeatureGCN1Encoding">;
+def isVI : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
+ AssemblerPredicate<"FeatureGCN3Encoding">;
+
class PredicateControl {
Predicate SubtargetPredicate;
Predicate SIAssemblerPredicate = isSICI;
+ Predicate VIAssemblerPredicate = isVI;
list<Predicate> AssemblerPredicates = [];
Predicate AssemblerPredicate = TruePredicate;
list<Predicate> OtherPredicates = [];
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {}
+void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+
+ // Need to construct an MCSubtargetInfo here in case we have no functions
+ // in the module.
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+ TM.getTargetTriple().str(), TM.getTargetCPU(),
+ TM.getTargetFeatureString()));
+
+ AMDGPUTargetStreamer *TS =
+ static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+
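+  // For example, on ISA version 7.0.0 this emits:
+  //   .hsa_code_object_version 1,0
+  //   .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"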
+ TS->EmitDirectiveHSACodeObjectVersion(1, 0);
+ AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
+ TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
+ "AMD", "AMDGPU");
+}
+
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
}
+ MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV));
const DataLayout &DL = getDataLayout();
+
+  // Emit the size of the global variable.
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+ OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
OutStreamer->PushSection();
OutStreamer->SwitchSection(
getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
- MCSymbol *GVSym = getSymbol(GV);
const Constant *C = GV->getInitializer();
OutStreamer->EmitLabel(GVSym);
EmitGlobalConstant(DL, C);
if (!STM.isAmdHsaOS()) {
EmitProgramInfoSI(MF, KernelInfo);
}
- // Emit directives
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
- TS->EmitDirectiveHSACodeObjectVersion(1, 0);
- AMDGPU::IsaVersion ISA = STM.getIsaVersion();
- TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
- "AMD", "AMDGPU");
} else {
EmitProgramInfoR600(MF);
}
unsigned reg = MO.getReg();
switch (reg) {
case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
case AMDGPU::SCC:
case AMDGPU::M0:
continue;
}
}
- if (VCCUsed || FlatUsed)
- MaxSGPR += 2;
+ unsigned ExtraSGPRs = 0;
- if (FlatUsed) {
- MaxSGPR += 2;
- // 2 additional for VI+.
- if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- MaxSGPR += 2;
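+  // VCC, FLAT_SCRATCH and (on VI+) XNACK_MASK are each an aligned pair of
+  // SGPRs reserved at the top of the SGPR file, so the values below are
+  // cumulative maxima rather than independent sums.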
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+
+ if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (FlatUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (STM.isXNACKEnabled())
+ ExtraSGPRs = 4;
+
+ if (FlatUsed)
+ ExtraSGPRs = 6;
}
+ MaxSGPR += ExtraSGPRs;
+
// We found the maximum register index. They start at 0, so add one to get the
// number of registers.
ProgInfo.NumVGPR = MaxVGPR + 1;
ProgInfo.DX10Clamp = 0;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
+ ProgInfo.ScratchSize = FrameInfo->getStackSize();
ProgInfo.FlatUsed = FlatUsed;
ProgInfo.VCCUsed = VCCUsed;
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
+ OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+ OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ }
+}
+
+// This is supposed to be log2(Size)
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+ switch (Size) {
+ case 4:
+ return AMD_ELEMENT_4_BYTES;
+ case 8:
+ return AMD_ELEMENT_8_BYTES;
+ case 16:
+ return AMD_ELEMENT_16_BYTES;
+ default:
+ llvm_unreachable("invalid private_element_size");
}
}
(KernelInfo.ComputePGMRSrc2 << 32);
header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+ AMD_HSA_BITS_SET(header.code_properties,
+ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+ getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
if (MFI->hasPrivateSegmentBuffer()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ if (STM.isXNACKEnabled())
+ header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+
header.kernarg_segment_byte_size = MFI->ABIArgOffset;
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void EmitStartOfAsmFile(Module &M) override;
+
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant, const char *ExtraCode,
raw_ostream &O) override;
CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
- SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
]>>>,
CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+ SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+ SGPR32, SGPR34, SGPR36, SGPR38 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+ SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+ SGPR33, SGPR35, SGPR37, SGPR39 ]
>>>,
+ // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
- VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+ VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+ VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+ VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+ VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+ VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+ VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+ VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+ VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+ VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+ VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+ VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
]>>>,
CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
+ SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
+ SGPR32, SGPR34, SGPR36, SGPR38 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
+ SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
+ SGPR33, SGPR35, SGPR37, SGPR39 ]
>>>
]>;
+def RetCC_SI : CallingConv<[
+ CCIfType<[i32] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+ SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+ SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+ ]>>,
+
+ // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
+ CCIfType<[f32] , CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+ VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+ VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+ VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+ VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+ VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+ VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+ VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+ VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+ VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+ VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+ VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+ VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+ VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+ ]>>
+]>;
+
// Calling convention for R600
def CC_R600 : CallingConv<[
CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
- void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+ bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
return isUInt<12>(Imm->getZExtValue());
}
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {
+  // The subtarget prefers flat instructions for global accesses, so fail
+  // MUBUF selection and let a flat pattern match instead.
+ if (Subtarget->useFlatForGlobal())
+ return false;
+
SDLoc DL(Addr);
GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
if (isLegalMUBUFImmOffset(C1)) {
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
- return;
+ return true;
} else if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
0);
- return;
+ return true;
}
}
Ptr = N0;
VAddr = N1;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
- return;
+ return true;
}
// default case -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return false;
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
if (C->getSExtValue()) {
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE);
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+ GLC, SLC, TFE))
+ return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
!cast<ConstantSDNode>(Idxen)->getSExtValue() &&
setOperationAction(ISD::SMAX, MVT::i32, Legal);
setOperationAction(ISD::UMAX, MVT::i32, Legal);
- if (!Subtarget->hasFFBH())
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+ else
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
if (!Subtarget->hasFFBL())
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+
+ setOperationAction(ISD::CTLZ, MVT::i64, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}
+void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) const {
+
+ State.AnalyzeReturn(Outs, RetCC_SI);
+}
+
SDValue AMDGPUTargetLowering::LowerReturn(
SDValue Chain,
CallingConv::ID CallConv,
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ return LowerCTLZ(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return Op;
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
+SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+ bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
+
+ if (ZeroUndef && Src.getValueType() == MVT::i32)
+ return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
+
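+  // Split the 64-bit source into 32-bit halves so the 32-bit
+  // ctlz_zero_undef can be used on each half.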
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
+
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), MVT::i32);
+
+ SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
+
+ SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
+ SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
+
+ const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
+ SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
+
+ // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
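+  // e.g. ctlz(i64 0x0000000000010000) = 32 + ctlz(i32 0x00010000) = 32 + 15 = 47.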
+ SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
+
+ if (!ZeroUndef) {
+ // Test if the full 64-bit input is zero.
+
+ // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
+ // which we probably don't want.
+ SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
+ SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
+
+ // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
+ // with the same cycles, otherwise it is slower.
+ // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
+ // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
+
+    const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);
+
+    // The instruction returns -1 for 0 input, but the defined intrinsic
+    // behavior is to return the number of bits.
+    NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
+                          SrcIsZero, Bits64, NewCtlz);
+ }
+
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
+}
+
+SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
+ bool Signed) const {
+ // Unsigned
+ // cul2f(ulong u)
+ //{
+ // uint lz = clz(u);
+ // uint e = (u != 0) ? 127U + 63U - lz : 0;
+ // u = (u << lz) & 0x7fffffffffffffffUL;
+ // ulong t = u & 0xffffffffffUL;
+ // uint v = (e << 23) | (uint)(u >> 40);
+ // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
+ // return as_float(v + r);
+ //}
+ // Signed
+ // cl2f(long l)
+ //{
+ // long s = l >> 63;
+ // float r = cul2f((l + s) ^ s);
+ // return s ? -r : r;
+ //}
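+  // Worked example: cul2f(1): lz = 63, e = 127 + 63 - 63 = 127,
+  // u = (1 << 63) & 0x7fffffffffffffff = 0, t = 0, v = 127 << 23 = 0x3f800000,
+  // r = 0, so the result is as_float(0x3f800000) = 1.0f.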
+
+ SDLoc SL(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue L = Src;
+
+ SDValue S;
+ if (Signed) {
+ const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
+ S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
+
+ SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
+ L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
+ }
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), MVT::f32);
+
+ SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
+ SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
+ SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
+ LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
+
+ SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
+ SDValue E = DAG.getSelect(SL, MVT::i32,
+ DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
+ DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
+ ZeroI32);
+
+ SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
+ DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
+ DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
+
+ SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
+ DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
+
+  SDValue UShr = DAG.getNode(ISD::SRL, SL, MVT::i64,
+                             U, DAG.getConstant(40, SL, MVT::i64));
+
+  SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
+      DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
+      DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShr));
+
+ SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
+ SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
+ SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
+
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+
+ SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
+
+ SDValue R = DAG.getSelect(SL, MVT::i32,
+ RCmp,
+ One,
+ DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
+ R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
+ R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
+
+ if (!Signed)
+ return R;
+
+ SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
+  return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT),
+                       RNeg, R);
+}
+
SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);
SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue S0 = Op.getOperand(0);
- if (S0.getValueType() != MVT::i64)
- return SDValue();
+ assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+ "operation should be legal");
EVT DestVT = Op.getValueType();
if (DestVT == MVT::f64)
return LowerINT_TO_FP64(Op, DAG, false);
- assert(DestVT == MVT::f32);
-
- SDLoc DL(Op);
+ if (DestVT == MVT::f32)
+ return LowerINT_TO_FP32(Op, DAG, false);
- // f32 uint_to_fp i64
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
- DAG.getConstant(1, DL, MVT::i32));
- SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
- // TODO: Should this propagate fast-math-flags?
- FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
- DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32
- return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+ return SDValue();
}
SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64)
+ assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+ "operation should be legal");
+
+ EVT DestVT = Op.getValueType();
+ if (DestVT == MVT::f32)
+ return LowerINT_TO_FP32(Op, DAG, true);
+
+ if (DestVT == MVT::f64)
return LowerINT_TO_FP64(Op, DAG, true);
return SDValue();
return DAG.getSExtOrTrunc(Mul, DL, VT);
}
+static bool isNegativeOne(SDValue Val) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
+ return C->isAllOnesValue();
+ return false;
+}
+
+static bool isCtlzOpc(unsigned Opc) {
+ return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
+}
+
+// Get FFBH node if the incoming op may have been type legalized from a smaller
+// type VT.
+// Need to match pre-legalized type because the generic legalization inserts the
+// add/sub between the select and compare.
+static SDValue getFFBH_U32(const TargetLowering &TLI,
+ SelectionDAG &DAG, SDLoc SL, SDValue Op) {
+ EVT VT = Op.getValueType();
+ EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ if (LegalVT != MVT::i32)
+ return SDValue();
+
+ if (VT != MVT::i32)
+ Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op);
+
+ SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op);
+ if (VT != MVT::i32)
+ FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH);
+
+ return FFBH;
+}
+
+// The native instructions return -1 on 0 input. Optimize out a select that
+// produces -1 on 0.
+//
+// TODO: If zero is not undef, we could also do this if the output is compared
+// against the bitwidth.
+//
+// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
+SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL,
+ SDValue Cond,
+ SDValue LHS,
+ SDValue RHS,
+ DAGCombinerInfo &DCI) const {
+ ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+ if (!CmpRhs || !CmpRhs->isNullValue())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ SDValue CmpLHS = Cond.getOperand(0);
+
+ // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
+ if (CCOpcode == ISD::SETEQ &&
+ isCtlzOpc(RHS.getOpcode()) &&
+ RHS.getOperand(0) == CmpLHS &&
+ isNegativeOne(LHS)) {
+ return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ }
+
+ // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
+ if (CCOpcode == ISD::SETNE &&
+ isCtlzOpc(LHS.getOpcode()) &&
+ LHS.getOperand(0) == CmpLHS &&
+ isNegativeOne(RHS)) {
+ return getFFBH_U32(*this, DAG, SL, CmpLHS);
+ }
+
+ return SDValue();
+}
+
+SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Cond = N->getOperand(0);
+ if (Cond.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ SDValue CC = Cond.getOperand(2);
+
+ SDValue True = N->getOperand(1);
+ SDValue False = N->getOperand(2);
+
+ if (VT == MVT::f32 && Cond.hasOneUse())
+ return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+
+ // There's no reason to not do this if the condition has other uses.
+ return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
simplifyI24(N1, DCI);
return SDValue();
}
- case ISD::SELECT: {
- SDValue Cond = N->getOperand(0);
- if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) {
- EVT VT = N->getValueType(0);
- SDValue LHS = Cond.getOperand(0);
- SDValue RHS = Cond.getOperand(1);
- SDValue CC = Cond.getOperand(2);
-
- SDValue True = N->getOperand(1);
- SDValue False = N->getOperand(2);
-
- if (VT == MVT::f32)
- return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
- }
-
- break;
- }
+ case ISD::SELECT:
+ return performSelectCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
NODE_NAME_CASE(BFE_I32)
NODE_NAME_CASE(BFI)
NODE_NAME_CASE(BFM)
+ NODE_NAME_CASE(FFBH_U32)
NODE_NAME_CASE(MUL_U24)
NODE_NAME_CASE(MUL_I24)
NODE_NAME_CASE(MAD_U24)
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS,
+ DAGCombinerInfo &DCI) const;
+ SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
protected:
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
SmallVectorImpl<ISD::InputArg> &OrigIns) const;
void AnalyzeFormalArguments(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
+ void AnalyzeReturn(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) const;
public:
AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
BFE_I32, // Extract range of bits with sign extension to 32-bits.
BFI, // (src0 & src1) | (~src0 & src2)
BFM, // Insert a range of bits into a 32-bit word.
+ FFBH_U32, // ctlz with -1 if input is zero.
MUL_U24,
MUL_I24,
MAD_U24,
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
+def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
+
// Signed and unsigned 24-bit multiply. The high 8 bits are ignored when
// performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
-def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
return isGlobalStore(dyn_cast<StoreSDNode>(N));
}]>;
-def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei8 node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
- (truncstorei16 node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return isLocalStore(dyn_cast<StoreSDNode>(N));
defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
-def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def flat_store : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>();
AMDGPUMCInstLower MCInstLowering(OutContext, STI);
-#ifdef _DEBUG
StringRef Err;
if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) {
- errs() << "Warning: Illegal instruction detected: " << Err << "\n";
+ LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
+ C.emitError("Illegal instruction detected: " + Err);
MI->dump();
}
-#endif
+
if (MI->isBundle()) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
namespace {
class AMDGPUPromoteAlloca : public FunctionPass,
- public InstVisitor<AMDGPUPromoteAlloca> {
-
- static char ID;
+ public InstVisitor<AMDGPUPromoteAlloca> {
+private:
+ const TargetMachine *TM;
Module *Mod;
- const AMDGPUSubtarget &ST;
int LocalMemAvailable;
public:
- AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
- LocalMemAvailable(0) { }
+ static char ID;
+
+ AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
+ FunctionPass(ID),
+    TM(TM_),
+ Mod(nullptr),
+ LocalMemAvailable(0) { }
+
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
char AMDGPUPromoteAlloca::ID = 0;
+INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
+
+char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+ if (!TM)
+ return false;
+
Mod = &M;
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+ if (!TM)
+ return false;
+
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
FunctionType *FTy = F.getFunctionType();
return true;
}
+static bool isCallPromotable(CallInst *CI) {
+ // TODO: We might be able to handle some cases where the callee is a
+ // constantexpr bitcast of a function.
+ if (!CI->getCalledFunction())
+ return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ case Intrinsic::objectsize:
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
- bool Success = true;
for (User *User : Val->users()) {
- if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
+ if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
continue;
+
if (CallInst *CI = dyn_cast<CallInst>(User)) {
- // TODO: We might be able to handle some cases where the callee is a
- // constantexpr bitcast of a function.
- if (!CI->getCalledFunction())
+ if (!isCallPromotable(CI))
return false;
WorkList.push_back(User);
return false;
if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
+ if (SI->isVolatile())
+ return false;
+
// Reject if the stored value is not the pointer operand.
if (SI->getPointerOperand() != Val)
return false;
+ } else if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
+ if (LI->isVolatile())
+ return false;
+ } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
+ if (RMW->isVolatile())
+ return false;
+ } else if (AtomicCmpXchgInst *CAS
+ = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
+ if (CAS->isVolatile())
+ return false;
}
if (!User->getType()->isPointerTy())
continue;
WorkList.push_back(User);
-
- Success &= collectUsesWithPtrTypes(User, WorkList);
+ if (!collectUsesWithPtrTypes(User, WorkList))
+ return false;
}
- return Success;
+
+ return true;
}
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
- if (!I.isStaticAlloca())
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
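+  // (An array allocation means e.g. "alloca i32, i32 8" rather than the
+  // canonical "alloca [8 x i32]".)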
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
return;
IRBuilder<> Builder(&I);
DEBUG(dbgs() << "Promoting alloca to local memory\n");
LocalMemAvailable -= AllocaSize;
+ Function *F = I.getParent()->getParent();
+
Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
GlobalVariable *GV = new GlobalVariable(
- *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
- GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+ *Mod, GVTy, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GVTy),
+ Twine(F->getName()) + Twine('.') + I.getName(),
+ nullptr,
+ GlobalVariable::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS);
+ GV->setUnnamedAddr(true);
+ GV->setAlignment(I.getAlignment());
FunctionType *FTy = FunctionType::get(
Type::getInt32Ty(Mod->getContext()), false);
IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
if (!Intr) {
+ // FIXME: What is this for? It doesn't make sense to promote arbitrary
+ // function calls. If the call is to a defined function that can also be
+ // promoted, we should be able to do this once that function is also
+ // rewritten.
+
std::vector<Type*> ArgTypes;
for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
ArgIdx != ArgEnd; ++ArgIdx) {
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::memmove: {
+ MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
+ Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
+ MemMove->getLength(), MemMove->getAlignment(),
+ MemMove->isVolatile());
+ Intr->eraseFromParent();
+ continue;
+ }
case Intrinsic::memset: {
MemSetInst *MemSet = cast<MemSetInst>(Intr);
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
Intr->eraseFromParent();
continue;
}
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::invariant_group_barrier:
+ Intr->eraseFromParent();
+ // FIXME: I think the invariant marker should still theoretically apply,
+ // but the intrinsics need to be changed to accept pointers with any
+ // address space.
+ continue;
+ case Intrinsic::objectsize: {
+ Value *Src = Intr->getOperand(0);
+ Type *SrcTy = Src->getType()->getPointerElementType();
+ Function *ObjectSize = Intrinsic::getDeclaration(Mod,
+ Intrinsic::objectsize,
+ { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+ );
+
+ CallInst *NewCall
+ = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+ Intr->replaceAllUsesWith(NewCall);
+ Intr->eraseFromParent();
+ continue;
+ }
default:
Intr->dump();
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
- return new AMDGPUPromoteAlloca(ST);
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
+ return new AMDGPUPromoteAlloca(TM);
}
// disable it.
SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+ if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
+ FullFS += "+flat-for-global,";
FullFS += FS;
- if (GPU == "" && TT.getArch() == Triple::amdgcn)
- GPU = "SI";
-
ParseSubtargetFeatures(GPU, FullFS);
  // FIXME: I don't think Evergreen has any useful support for
FP32Denormals = false;
FP64Denormals = false;
}
+
+ // Set defaults if needed.
+ if (MaxPrivateElementSize == 0)
+ MaxPrivateElementSize = 16;
+
return *this;
}
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
TargetMachine &TM)
- : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false),
+ : AMDGPUGenSubtargetInfo(TT, GPU, FS),
DumpCode(false), R600ALUInst(false), HasVertexCache(false),
TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
- CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
- EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
- EnableUnsafeDSOffsetFolding(false),
+ CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
+ EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+ EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
+ EnableXNACK(false),
WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
+ MaxPrivateElementSize(0),
EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
- FrameLowering(nullptr),
+ EnableSIScheduler(false), FrameLowering(nullptr),
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
initializeSubtargetDependencies(TT, GPU, FS);
ISAVersion7_0_0,
ISAVersion7_0_1,
ISAVersion8_0_0,
- ISAVersion8_0_1
+ ISAVersion8_0_1,
+ ISAVersion8_0_3
};
private:
- std::string DevName;
- bool Is64bit;
bool DumpCode;
bool R600ALUInst;
bool HasVertexCache;
bool FastFMAF32;
bool CaymanISA;
bool FlatAddressSpace;
+ bool FlatForGlobal;
bool EnableIRStructurizer;
bool EnablePromoteAlloca;
bool EnableIfCvt;
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
+ bool EnableXNACK;
unsigned WavefrontSize;
bool CFALUBug;
int LocalMemorySize;
+ unsigned MaxPrivateElementSize;
bool EnableVGPRSpilling;
bool SGPRInitBug;
bool IsGCN;
int LDSBankCount;
unsigned IsaVersion;
bool EnableHugeScratchBuffer;
+ bool EnableSIScheduler;
std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
std::unique_ptr<AMDGPUTargetLowering> TLInfo;
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool is64bit() const {
- return Is64bit;
- }
-
bool hasVertexCache() const {
return HasVertexCache;
}
return FlatAddressSpace;
}
+ bool useFlatForGlobal() const {
+ return FlatForGlobal;
+ }
+
bool hasBFE() const {
return (getGeneration() >= EVERGREEN);
}
return LocalMemorySize;
}
+ unsigned getMaxPrivateElementSize() const {
+ return MaxPrivateElementSize;
+ }
+
bool hasSGPRInitBug() const {
return SGPRInitBug;
}
return false;
}
- StringRef getDeviceName() const {
- return DevName;
- }
-
bool enableHugeScratchBuffer() const {
return EnableHugeScratchBuffer;
}
+ bool enableSIScheduler() const {
+ return EnableSIScheduler;
+ }
+
bool dumpCode() const {
return DumpCode;
}
}
bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;
+ bool isXNACKEnabled() const {
+ return EnableXNACK;
+ }
+
unsigned getMaxWavesPerCU() const {
if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
return 10;
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertWaitsPass(*PR);
+ initializeSILowerControlFlowPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
}
static MachineSchedRegistry
-SchedCustomRegistry("r600", "Run R600's custom scheduler",
- createR600MachineScheduler);
+R600SchedRegistry("r600", "Run R600's custom scheduler",
+ createR600MachineScheduler);
+
+static MachineSchedRegistry
+SISchedRegistry("si", "Run SI's custom scheduler",
+ createSIMachineScheduler);
static std::string computeDataLayout(const Triple &TT) {
std::string Ret = "e-p:32:32";
return Ret;
}
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+ if (!GPU.empty())
+ return GPU;
+
+ // HSA only supports CI+, so change the default GPU to a CI for HSA.
+ if (TT.getArch() == Triple::amdgcn)
+ return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+ return "";
+}
+
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options, Reloc::Model RM,
CodeModel::Model CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT,
+ getGPUOrDefault(TT, CPU), FS, Options, RM, CM,
OptLevel),
- TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
+ TLOF(createTLOF(getTargetTriple())),
+ Subtarget(TT, getTargetCPU(), FS, *this),
IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
return createR600MachineScheduler(C);
+ else if (ST.enableSIScheduler())
+ return createSIMachineScheduler(C);
return nullptr;
}
void AMDGPUPassConfig::addCodeGenPrepare() {
const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
if (ST.isPromoteAllocaEnabled()) {
- addPass(createAMDGPUPromoteAlloca(ST));
+ addPass(createAMDGPUPromoteAlloca(TM));
addPass(createSROAPass());
}
TargetPassConfig::addCodeGenPrepare();
}
void GCNPassConfig::addPreEmitPass() {
- addPass(createSIInsertWaits(*TM), false);
- addPass(createSILowerControlFlowPass(*TM), false);
+ addPass(createSIInsertWaitsPass(), false);
+ addPass(createSILowerControlFlowPass(), false);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; }
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
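+  // Reporting a width of 0 for vectors tells the vectorizers there are no
+  // native vector registers, discouraging vectorization.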
+ return Vector ? 0 : 32;
+}
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Semi-arbitrary large amount.
return false;
case AMDGPUIntrinsic::SI_tid:
case AMDGPUIntrinsic::SI_fs_interp:
+ case AMDGPUIntrinsic::SI_fs_constant:
return true;
}
}
AMD_CODE_VERSION_MINOR = 1
};
+// Sets val bits for specified mask in specified dst packed instance.
+#define AMD_HSA_BITS_SET(dst, mask, val) \
+ dst &= (~(1 << mask ## _SHIFT) & ~mask); \
+ dst |= (((val) << mask ## _SHIFT) & mask)
+
+// Gets bits for specified mask from specified src packed instance.
+#define AMD_HSA_BITS_GET(src, mask) \
+  ((src & mask) >> mask ## _SHIFT)
+
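+// Example use (mirroring AMDGPUAsmPrinter):
+//   AMD_HSA_BITS_SET(header.code_properties,
+//                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+//                    AMD_ELEMENT_4_BYTES);
+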
/// The values used to define the number of bytes to use for the
/// swizzle element size.
enum amd_element_byte_size_t {
CntMask = 0x7;
CntShift = 4;
} else if (CntName == "lgkmcnt") {
- CntMask = 0x7;
+ CntMask = 0xf;
CntShift = 8;
} else {
return true;
// Disable all counters by default.
// vmcnt [3:0]
// expcnt [6:4]
- // lgkmcnt [10:8]
- int64_t CntVal = 0x77f;
+ // lgkmcnt [11:8]
+ int64_t CntVal = 0xf7f;
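+  // 0xf7f == (lgkmcnt 0xf << 8) | (expcnt 0x7 << 4) | (vmcnt 0xf), i.e. all
+  // three counters left at their maximum.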
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
// Flat Instructions
//===----------------------------------------------------------------------===//
-def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>;
-def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>;
-def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>;
-def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>;
-def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>;
-def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>;
-def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>;
-def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>;
-def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>;
-def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>;
-def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>;
-def FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
- 0x1d, "flat_store_dwordx2", VReg_64
->;
-def FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
- 0x1e, "flat_store_dwordx4", VReg_128
->;
-def FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
- 0x1f, "flat_store_dwordx3", VReg_96
->;
-defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>;
+defm FLAT_LOAD_UBYTE : FLAT_Load_Helper <
+ flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32
+>;
+defm FLAT_LOAD_SBYTE : FLAT_Load_Helper <
+ flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32
+>;
+defm FLAT_LOAD_USHORT : FLAT_Load_Helper <
+ flat<0xa, 0x12>, "flat_load_ushort", VGPR_32
+>;
+defm FLAT_LOAD_SSHORT : FLAT_Load_Helper <
+  flat<0xb, 0x13>, "flat_load_sshort", VGPR_32
+>;
+defm FLAT_LOAD_DWORD : FLAT_Load_Helper <
+ flat<0xc, 0x14>, "flat_load_dword", VGPR_32
+>;
+defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <
+ flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64
+>;
+defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <
+ flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128
+>;
+defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <
+ flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96
+>;
+defm FLAT_STORE_BYTE : FLAT_Store_Helper <
+ flat<0x18>, "flat_store_byte", VGPR_32
+>;
+defm FLAT_STORE_SHORT : FLAT_Store_Helper <
+  flat<0x1a>, "flat_store_short", VGPR_32
+>;
+defm FLAT_STORE_DWORD : FLAT_Store_Helper <
+ flat<0x1c>, "flat_store_dword", VGPR_32
+>;
+defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper <
+ flat<0x1d>, "flat_store_dwordx2", VReg_64
+>;
+defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper <
+ flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128
+>;
+defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper <
+ flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96
+>;
+defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <
+ flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
+>;
defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
- 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64
->;
-defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>;
-defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>;
-defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>;
-defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>;
-defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>;
-defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>;
-defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>;
-defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>;
-defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>;
-defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>;
-defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>;
-defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>;
-defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
- 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+ flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
+ flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
+>;
+defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <
+ flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32
+>;
+defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <
+ flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32
+>;
+defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <
+ flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32
+>;
+defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <
+ flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32
+>;
+defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <
+ flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32
+>;
+defm FLAT_ATOMIC_AND : FLAT_ATOMIC <
+ flat<0x39, 0x48>, "flat_atomic_and", VGPR_32
+>;
+defm FLAT_ATOMIC_OR : FLAT_ATOMIC <
+ flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32
+>;
+defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <
+ flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32
+>;
+defm FLAT_ATOMIC_INC : FLAT_ATOMIC <
+ flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32
+>;
+defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <
+ flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32
+>;
+defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <
+ flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>;
-defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>;
-defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>;
defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
- 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
->;
-defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>;
-defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>;
-defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>;
-defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>;
-defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>;
-defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>;
-defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>;
-defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>;
-defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>;
-defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>;
-defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>;
-defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>;
-defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
- 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+ flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
+ flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <
+ flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <
+ flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <
+ flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <
+ flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <
+ flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64
+>;
+defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <
+ flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64
+>;
+defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <
+ flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64
+>;
+defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <
+ flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64
+>;
+defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <
+ flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64
+>;
+defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <
+ flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64
>;
-defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>;
-defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>;
} // End SubtargetPredicate = isCIVI
-//===----------------------------------------------------------------------===//
-// Flat Patterns
-//===----------------------------------------------------------------------===//
+// CI-only flat instructions
-let Predicates = [HasFlatAddressSpace] in {
+let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in {
-class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
- PatFrag flat_ld> :
- Pat <(vt (flat_ld i64:$ptr)),
- (Instr_ADDR64 $ptr, 0, 0, 0)
+defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC <
+ flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64
+>;
+defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <
+ flat<0x3f>, "flat_atomic_fmin", VGPR_32
+>;
+defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <
+ flat<0x40>, "flat_atomic_fmax", VGPR_32
+>;
+defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC <
+ flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128
+>;
+defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <
+ flat<0x5f>, "flat_atomic_fmin_x2", VReg_64
+>;
+defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <
+ flat<0x60>, "flat_atomic_fmax_x2", VReg_64
>;
-def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
-
-class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
- Pat <(st vt:$value, i64:$ptr),
- (Instr $value, $ptr, 0, 0, 0)
- >;
-
-def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
-def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
-def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
-
-} // End HasFlatAddressSpace predicate
+} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst
let Predicates = [isCI] in {
>;
} // End Predicates = [isCI]
+
+//===----------------------------------------------------------------------===//
+// Flat Patterns
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isCIVI] in {
+
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr)),
+ (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;
+
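+// Patterns for global stores with no offset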
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (node vt:$data, i64:$addr),
+ (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;
+
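+// Patterns for global atomics with no offset. Only the returning (_RTN)
+// variants are selected here, since the atomic SDNodes yield the pre-op
+// memory value.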
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+ (vt (node i64:$addr, vt:$data)),
+ (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+} // End Predicates = [isCIVI]
SILowerControlFlow.cpp
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
+ SIMachineScheduler.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
-def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>;
+def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>;
let hasSideEffects = 1 in {
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
- // SIInsertWaits.cpp bits usage does not match ISA docs description but it
- // works so it might be a misprint in docs.
unsigned SImm16 = MI->getOperand(OpNo).getImm();
unsigned Vmcnt = SImm16 & 0xF;
- unsigned Expcnt = (SImm16 >> 4) & 0xF;
+ unsigned Expcnt = (SImm16 >> 4) & 0x7;
unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
bool NeedSpace = false;
NeedSpace = true;
}
- if (Lgkmcnt != 0x7) {
+ if (Lgkmcnt != 0xF) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
//===--- Global Variable Emission Directives --------------------------===//
HasAggressiveSymbolFolding = true;
COMMDirectiveAlignmentIsInBytes = false;
- HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
WeakRefDirective = ".weakref\t";
//===--- Dwarf Emission Directives -----------------------------------===//
//===----------------------------------------------------------------------===//
def : ProcessorModel<"tonga", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+ [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
+ FeatureLDSBankCount32]
>;
def : ProcessorModel<"iceland", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0]
+ [FeatureVolcanicIslands, FeatureSGPRInitBug, FeatureISAVersion8_0_0,
+ FeatureLDSBankCount32]
>;
def : ProcessorModel<"carrizo", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32]
>;
def : ProcessorModel<"fiji", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_3, FeatureLDSBankCount32]
>;
def : ProcessorModel<"stoney", SIQuarterSpeedModel,
- [FeatureVolcanicIslands, FeatureISAVersion8_0_1]
+ [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16]
>;
class SIAnnotateControlFlow : public FunctionPass {
- static char ID;
-
Type *Boolean;
Type *Void;
Type *Int64;
void insertElse(BranchInst *Term);
- Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L);
+ Value *handleLoopCondition(Value *Cond, PHINode *Broken,
+ llvm::Loop *L, BranchInst *Term);
void handleLoop(BranchInst *Term);
void closeControlFlow(BasicBlock *BB);
public:
+ static char ID;
+
SIAnnotateControlFlow():
FunctionPass(ID) { }
} // end anonymous namespace
+INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
+ "Annotate SI Control Flow", false, false)
+
char SIAnnotateControlFlow::ID = 0;
/// \brief Initialize all the types and constants used in the pass
/// \brief Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L) {
+ llvm::Loop *L, BranchInst *Term) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
}
Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
NewPhi->addIncoming(PhiArg, From);
}
BasicBlock *From = Phi->getIncomingBlock(i);
if (From == IDom) {
+ // We're in the following situation:
+ // IDom/From
+ // | \
+ // | If-block
+ // | /
+ // Parent
+ // where we want to break out of the loop if the If-block is not taken.
+ // Due to the depth-first traversal, there should be an end.cf
+ // intrinsic in Parent, and we insert an else.break before it.
+ //
+ // Note that the end.cf need not be the first non-phi instruction
+ // of parent, particularly when we're dealing with a multi-level
+ // break, but it should occur within a group of intrinsic calls
+ // at the beginning of the block.
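+      //
+      // Schematically (names illustrative), if Parent begins with
+      //   call @end.cf(%saved)
+      // we rewrite it to
+      //   %ret = call @else.break(%saved, %new.phi)
+      //   call @end.cf(%saved)
+      // and return %ret as the break condition.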
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+ while (OldEnd && OldEnd->getCalledFunction() != EndCf)
+ OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
+ // Insert IfBreak before TERM for constant COND.
+ } else if (isa<ConstantInt>(Cond)) {
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Term);
+
} else {
llvm_unreachable("Unhandled loop condition!");
}
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
PI != PE; ++PI) {
#define C_00B84C_EXCP_EN
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
-
+#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0)
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
unsigned SrcReg = MI.getOperand(I).getReg();
- unsigned SrcSubReg = MI.getOperand(I).getReg();
+ unsigned SrcSubReg = MI.getOperand(I).getSubReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
assert(TRI->isSGPRClass(SrcRC) &&
!MRI.hasOneUse(MI.getOperand(0).getReg()))
continue;
- // FIXME: Fold operands with subregs.
if (OpToFold.isReg() &&
- (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
- OpToFold.getSubReg()))
+ !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
continue;
+ // Prevent folding operands backwards in the function. For example,
+ // the COPY opcode must not be replaced by 1 in this example:
+ //
+ // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
+ // ...
+ // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
+ MachineOperand &Dst = MI.getOperand(0);
+ if (Dst.isReg() &&
+ !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ continue;
// We need mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer
static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
const MachineFrameInfo *FrameInfo) {
- if (!FuncInfo->hasSpilledSGPRs())
- return false;
-
- if (FuncInfo->hasSpilledVGPRs())
- return false;
-
- for (int I = FrameInfo->getObjectIndexBegin(),
- E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
- if (!FrameInfo->isSpillSlotObjectIndex(I))
- return false;
- }
-
- return true;
+  return FuncInfo->hasSpilledSGPRs() &&
+         !FuncInfo->hasSpilledVGPRs() &&
+         !FuncInfo->hasNonSpillStackObjects();
}
static ArrayRef<MCPhysReg> getAllSGPR128() {
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineBasicBlock::iterator I = MBB.begin();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
+ if (MFI->hasFlatScratchInit()) {
+    // We don't need this if we only have spills, since there is no
+    // user-facing scratch.
+
+ // TODO: If we know we don't have flat instructions earlier, we can omit
+ // this from the input registers.
+ //
+ // TODO: We only need to know if we access scratch space through a flat
+ // pointer. Because we only detect if flat instructions are used at all,
+ // this will be used more often than necessary on VI.
+
+ DebugLoc DL;
+
+ unsigned FlatScratchInitReg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ // Copy the size in bytes.
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
+ unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+ // Add wave offset in bytes to private base offset.
+ // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+
+ // Convert offset to 256-byte units.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitLo, RegState::Kill)
+ .addImm(8);
+ }
+
// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
- MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
- // We reserved the last registers for this. Shift it down to the end of those
- // which were actually used.
- //
- // FIXME: It might be safer to use a pseudoregister before replacement.
-
- // FIXME: We should be able to eliminate unused input registers. We only
- // cannot do this for the resources required for scratch access. For now we
- // skip over user SGPRs and may leave unused holes.
-
- // We find the resource first because it has an alignment requirement.
- if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
- // Pick the first unallocated one. Make sure we don't clobber the other
- // reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg));
- MRI.replaceRegWith(ScratchRsrcReg, Reg);
- ScratchRsrcReg = Reg;
- MFI->setScratchRSrcReg(ScratchRsrcReg);
- break;
+ if (!ST.hasSGPRInitBug()) {
+ // We reserved the last registers for this. Shift it down to the end of those
+ // which were actually used.
+ //
+ // FIXME: It might be safer to use a pseudoregister before replacement.
+
+ // FIXME: We should be able to eliminate unused input registers. We only
+ // cannot do this for the resources required for scratch access. For now we
+ // skip over user SGPRs and may leave unused holes.
+
+ // We find the resource first because it has an alignment requirement.
+ if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+ // Skip the last 2 elements because the last one is reserved for VCC, and
+ // this is the 2nd to last element already.
+ for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+ // Pick the first unallocated one. Make sure we don't clobber the other
+ // reserved input we needed.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg));
+ MRI.replaceRegWith(ScratchRsrcReg, Reg);
+ ScratchRsrcReg = Reg;
+ MFI->setScratchRSrcReg(ScratchRsrcReg);
+ break;
+ }
}
}
- }
- if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
- // Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven’t added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) {
- assert(MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
-
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- ScratchWaveOffsetReg = Reg;
- MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
- break;
+ if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+
+      // We need to drop registers from the end of the list that we cannot
+      // use for the scratch wave offset:
+      // + 2 because s102 and s103 do not exist on VI
+      // + 2 for vcc
+      // + 2 for xnack_mask
+      // + 2 for flat_scratch
+      // + 4 for the registers reserved for the scratch resource register
+      // + 1 for the register reserved for the scratch wave offset. (By
+      //       excluding this register from the list to consider, the value
+      //       simply stays in this register when it is used for the scratch
+      //       wave offset and there are no other free SGPRs.)
+      // ----
+      // 13
+ for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
+ // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+        // scratch descriptor, since we haven't added its uses yet.
+ if (!MRI.isPhysRegUsed(Reg)) {
+ assert(MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ ScratchWaveOffsetReg = Reg;
+ MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ break;
+ }
}
}
}
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;
if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
setTargetDAGCombine(ISD::UMAX);
- setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
// First check if it's a PS input addr
if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal()) {
+ !Arg.Flags.isByVal() && PSInputNum <= 15) {
- assert((PSInputNum <= 15) && "Too many PS inputs!");
-
- if (!Arg.Used) {
+ if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
// We can safely skip PS inputs
Skipped.set(i);
++PSInputNum;
continue;
}
- Info->PSInputAddr |= 1 << PSInputNum++;
+ Info->markPSInputAllocated(PSInputNum);
+ if (Arg.Used)
+ Info->PSInputEna |= 1 << PSInputNum;
+
+ ++PSInputNum;
}
// Second split vertices into their elements
*DAG.getContext());
// At least one interpolation mode must be enabled or else the GPU will hang.
+ //
+ // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
+ // PSInputAddr, the user wants to enable some bits after the compilation
+  // based on run-time states. Since we can't know what the final PSInputEna
+  // will look like, we shouldn't do anything here; the user should take
+  // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
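+  //
+  // For example, if only POS_W_FLOAT (bit 11) were requested, we would
+  // force-enable PERSP_SAMPLE (bit 0) here to satisfy these restrictions.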
if (Info->getShaderType() == ShaderType::PIXEL &&
- (Info->PSInputAddr & 0x7F) == 0) {
- Info->PSInputAddr |= 1;
+ ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11)))) {
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->PSInputEna |= 1;
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
CCInfo.AllocateReg(InputPtrReg);
}
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
// Now that we've figured out where the scratch register inputs are, see if
// we should reserve the arguments and use them directly.
-
bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+ // Record that we know we have non-spill stack objects so we don't need to
+ // check all stack objects later.
+ if (HasStackObjects)
+ Info->setHasNonSpillStackObjects(true);
if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
+SDValue SITargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (Info->getShaderType() == ShaderType::COMPUTE)
+ return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
+ OutVals, DL, DAG);
+
+ Info->setIfReturnsVoid(Outs.size() == 0);
+
+ SmallVector<ISD::OutputArg, 48> Splits;
+ SmallVector<SDValue, 48> SplitVals;
+
+ // Split vectors into their elements.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ const ISD::OutputArg &Out = Outs[i];
+
+ if (Out.VT.isVector()) {
+ MVT VT = Out.VT.getVectorElementType();
+ ISD::OutputArg NewOut = Out;
+ NewOut.Flags.setSplit();
+ NewOut.VT = VT;
+
+ // We want the original number of vector elements here, e.g.
+ // three or five, not four or eight.
+ unsigned NumElements = Out.ArgVT.getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
+ DAG.getConstant(j, DL, MVT::i32));
+ SplitVals.push_back(Elem);
+ Splits.push_back(NewOut);
+ NewOut.PartOffset += NewOut.VT.getStoreSize();
+ }
+ } else {
+ SplitVals.push_back(OutVals[i]);
+ Splits.push_back(Out);
+ }
+ }
+
+ // CCValAssign - represent the assignment of the return value to a location.
+ SmallVector<CCValAssign, 48> RVLocs;
+
+ // CCState - Info about the registers and stack slots.
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Analyze outgoing return values.
+ AnalyzeReturn(CCInfo, Splits);
+
+ SDValue Flag;
+ SmallVector<SDValue, 48> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0, realRVLocIdx = 0;
+ i != RVLocs.size();
+ ++i, ++realRVLocIdx) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = SplitVals[realRVLocIdx];
+
+ // Copied from other backends.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ // Update chain and glue.
+ RetOps[0] = Chain;
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
switch (IntrinsicID) {
case Intrinsic::amdgcn_dispatch_ptr:
+ if (!Subtarget->isAmdHsaOS()) {
+ DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(),
+ "hsa intrinsic without hsa target");
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+ }
+
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
case ISD::UINT_TO_FP: {
return performUCharToFloatCombine(N, DCI);
-
+ }
case ISD::FADD: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
break;
}
- }
case ISD::LOAD:
case ISD::STORE:
case ISD::ATOMIC_LOAD:
SDLoc DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc DL, SelectionDAG &DAG) const override;
+
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
MachineBasicBlock * BB) const override;
bool enableAggressiveFMAFusion(EVT VT) const override;
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#define DEBUG_TYPE "si-insert-waits"
+
using namespace llvm;
namespace {
class SIInsertWaits : public MachineFunctionPass {
private:
- static char ID;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
bool LastInstWritesM0;
+ /// \brief Whether the machine function returns void
+ bool ReturnsVoid;
+
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
public:
- SIInsertWaits(TargetMachine &tm) :
+ static char ID;
+
+ SIInsertWaits() :
MachineFunctionPass(ID),
TII(nullptr),
TRI(nullptr),
} // End anonymous namespace
+INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
+ "SI Insert Waits", false, false)
+
char SIInsertWaits::ID = 0;
-const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
-const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
-FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
- return new SIInsertWaits(tm);
+FunctionPass *llvm::createSIInsertWaitsPass() {
+ return new SIInsertWaits();
}
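+
+// Hardware maximums for the three counters: VM_CNT and LGKM_CNT are 4-bit
+// fields, EXP_CNT is a 3-bit field (see the S_WAITCNT immediate layout).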
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
uint64_t TSFlags = MI.getDesc().TSFlags;
Counters Result = { { 0, 0, 0 } };
const Counters &Required) {
// End of program? No need to wait on anything
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+ // A function not returning void needs to wait, because other bytecode will
+ // be appended after it and we don't know what it will be.
+ if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
return false;
// Figure out if the async instructions execute in order
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
.addImm((Counts.Named.VM & 0xF) |
((Counts.Named.EXP & 0x7) << 4) |
- ((Counts.Named.LGKM & 0x7) << 8));
+ ((Counts.Named.LGKM & 0xF) << 8));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI =
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
-
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
+ ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
+ // Insert required wait states for SMRD reading an SGPR written by a VALU
+ // instruction.
+ if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
+ TII->insertWaitStates(std::next(I), 4);
+
// Wait for everything before a barrier.
if (I->getOpcode() == AMDGPU::S_BARRIER)
Changes |= insertWait(MBB, I, LastIssued);
let MIMG = 1;
let Uses = [EXEC];
+ let UseNamedOperandTable = 1;
let hasSideEffects = 0; // XXX ????
}
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
return AMDGPU::SI_SPILL_V64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
case 32:
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_V64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
case 32:
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
MachineOperand &Src1 = MI->getOperand(Src1Idx);
- if (isVOP2(*MI)) {
+ if (isVOP2(*MI) || isVOPC(*MI)) {
const MCInstrDesc &InstrDesc = MI->getDesc();
- // For VOP2 instructions, any operand type is valid to use for src0. Make
- // sure we can use the src1 as src0.
+ // For VOP2 and VOPC instructions, any operand type is valid to use for
+    // src0. Make sure we can use src0 as src1.
//
// We could be stricter here and only allow commuting if there is a reason
// to do so. i.e. if both operands are VGPRs there is no real benefit,
case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
- case AMDGPU::S_LOAD_DWORD_IMM:
- case AMDGPU::S_LOAD_DWORD_SGPR:
- case AMDGPU::S_LOAD_DWORD_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
- case AMDGPU::S_LOAD_DWORDX2_IMM:
- case AMDGPU::S_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
- case AMDGPU::S_LOAD_DWORDX4_IMM:
- case AMDGPU::S_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
- return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
MRI.getRegClass(Reg) :
RI.getPhysRegClass(Reg);
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+ RC = TRI->getSubRegClass(RC, MO.getSubReg());
+
// In order to be legal, the common sub-class must be equal to the
// class of the current operand. For example:
//
bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
const MachineOperand *MO) const {
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- const MCInstrDesc &InstDesc = get(MI->getOpcode());
+ const MCInstrDesc &InstDesc = MI->getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
const TargetRegisterClass *DefinedRC =
OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
if (isVALU(*MI) &&
usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
- unsigned SGPRUsed =
- MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
+
+ RegSubRegPair SGPRUsed;
+ if (MO->isReg())
+ SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
+
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
if (i == OpIdx)
continue;
const MachineOperand &Op = MI->getOperand(i);
- if (Op.isReg() && Op.getReg() != SGPRUsed &&
+ if (Op.isReg() &&
+ (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
return false;
}
}
}
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
+ MachineRegisterInfo &MRI) const {
+ const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+ unsigned DstReg = MRI.createVirtualRegister(SRC);
+ unsigned SubRegs = VRC->getSize() / 4;
+
+ SmallVector<unsigned, 8> SRegs;
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
+ get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+ .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+ SRegs.push_back(SGPR);
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
+ UseMI->getDebugLoc(),
+ get(AMDGPU::REG_SEQUENCE), DstReg);
+ for (unsigned i = 0; i < SubRegs; ++i) {
+ MIB.addReg(SRegs[i]);
+ MIB.addImm(RI.getSubRegFromChannel(i));
+ }
+ return DstReg;
+}
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+ MachineInstr *MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instructions, so we know the
+  // pointer value is uniform.
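+  //
+  // For example (registers illustrative), an s_load_dword whose sbase ended
+  // up in v[0:1] becomes:
+  //   %sgpr0 = V_READFIRSTLANE_B32 %vgpr0
+  //   %sgpr1 = V_READFIRSTLANE_B32 %vgpr1
+  //   %sbase = REG_SEQUENCE %sgpr0, sub0, %sgpr1, sub1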
+ MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
+ if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+ SBase->setReg(SGPR);
+ }
+}
+
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
return;
}
+ // Legalize SMRD
+ if (isSMRD(*MI)) {
+ legalizeOperandsSMRD(MRI, MI);
+ return;
+ }
+
// Legalize REG_SEQUENCE and PHI
// The register class of the operands much be the same type as the register
// class of the output.
return;
}
+ // Legalize MIMG
+ if (isMIMG(*MI)) {
+ MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+ if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+ SRsrc->setReg(SGPR);
+ }
+
+ MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
+ if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+ unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+ SSamp->setReg(SGPR);
+ }
+ return;
+ }
+
// Legalize MUBUF* instructions
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
}
}
-void SIInstrInfo::splitSMRD(MachineInstr *MI,
- const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const {
-
- DebugLoc DL = MI->getDebugLoc();
- MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned RegLo = MRI.createVirtualRegister(HalfRC);
- unsigned RegHi = MRI.createVirtualRegister(HalfRC);
- unsigned HalfSize = HalfRC->getSize();
- const MachineOperand *OffOp =
- getNamedOperand(*MI, AMDGPU::OpName::offset);
- const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
-
- // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes
- // on VI.
-
- bool IsKill = SBase->isKill();
- if (OffOp) {
- bool isVI =
- MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >=
- AMDGPUSubtarget::VOLCANIC_ISLANDS;
- unsigned OffScale = isVI ? 1 : 4;
- // Handle the _IMM variant
- unsigned LoOffset = OffOp->getImm() * OffScale;
- unsigned HiOffset = LoOffset + HalfSize;
- Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo)
- // Use addReg instead of addOperand
- // to make sure kill flag is cleared.
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addImm(LoOffset / OffScale);
-
- if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) {
- unsigned OffsetSGPR =
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR)
- .addImm(HiOffset); // The offset in register is in bytes.
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- } else {
- Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addImm(HiOffset / OffScale);
- }
- } else {
- // Handle the _SGPR variant
- MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff);
- Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo)
- .addReg(SBase->getReg(), 0, SBase->getSubReg())
- .addOperand(*SOff);
- unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR)
- .addReg(SOff->getReg(), 0, SOff->getSubReg())
- .addImm(HalfSize);
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi)
- .addReg(SBase->getReg(), getKillRegState(IsKill),
- SBase->getSubReg())
- .addReg(OffsetSGPR);
- }
-
- unsigned SubLo, SubHi;
- const TargetRegisterClass *NewDstRC;
- switch (HalfSize) {
- case 4:
- SubLo = AMDGPU::sub0;
- SubHi = AMDGPU::sub1;
- NewDstRC = &AMDGPU::VReg_64RegClass;
- break;
- case 8:
- SubLo = AMDGPU::sub0_sub1;
- SubHi = AMDGPU::sub2_sub3;
- NewDstRC = &AMDGPU::VReg_128RegClass;
- break;
- case 16:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3;
- SubHi = AMDGPU::sub4_sub5_sub6_sub7;
- NewDstRC = &AMDGPU::VReg_256RegClass;
- break;
- case 32:
- SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
- SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15;
- NewDstRC = &AMDGPU::VReg_512RegClass;
- break;
- default:
- llvm_unreachable("Unhandled HalfSize");
- }
-
- unsigned OldDst = MI->getOperand(0).getReg();
- unsigned NewDst = MRI.createVirtualRegister(NewDstRC);
-
- MRI.replaceRegWith(OldDst, NewDst);
-
- BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst)
- .addReg(RegLo)
- .addImm(SubLo)
- .addReg(RegHi)
- .addImm(SubHi);
-}
-
-void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI,
- MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const {
- MachineBasicBlock *MBB = MI->getParent();
- int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
- assert(DstIdx != -1);
- unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass;
- switch(RI.getRegClass(DstRCID)->getSize()) {
- case 4:
- case 8:
- case 16: {
- unsigned NewOpcode = getVALUOp(*MI);
- unsigned RegOffset;
- unsigned ImmOffset;
-
- if (MI->getOperand(2).isReg()) {
- RegOffset = MI->getOperand(2).getReg();
- ImmOffset = 0;
- } else {
- assert(MI->getOperand(2).isImm());
- // SMRD instructions take a dword offsets on SI and byte offset on VI
- // and MUBUF instructions always take a byte offset.
- ImmOffset = MI->getOperand(2).getImm();
- if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <=
- AMDGPUSubtarget::SEA_ISLANDS)
- ImmOffset <<= 2;
- RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-
- if (isUInt<12>(ImmOffset)) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(0);
- } else {
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
- RegOffset)
- .addImm(ImmOffset);
- ImmOffset = 0;
- }
- }
-
- unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
- unsigned DWord0 = RegOffset;
- unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
-
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
- .addImm(0);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
- .addImm(RsrcDataFormat & 0xFFFFFFFF);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
- .addImm(RsrcDataFormat >> 32);
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
- .addReg(DWord0)
- .addImm(AMDGPU::sub0)
- .addReg(DWord1)
- .addImm(AMDGPU::sub1)
- .addReg(DWord2)
- .addImm(AMDGPU::sub2)
- .addReg(DWord3)
- .addImm(AMDGPU::sub3);
-
- const MCInstrDesc &NewInstDesc = get(NewOpcode);
- const TargetRegisterClass *NewDstRC
- = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass);
- unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
- unsigned DstReg = MI->getOperand(0).getReg();
- MRI.replaceRegWith(DstReg, NewDstReg);
-
- MachineInstr *NewInst =
- BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg)
- .addOperand(MI->getOperand(1)) // sbase
- .addReg(SRsrc)
- .addImm(0)
- .addImm(ImmOffset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- MI->eraseFromParent();
-
- legalizeOperands(NewInst);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- break;
- }
- case 32: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM,
- AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
-
- case 64: {
- MachineInstr *Lo, *Hi;
- splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM,
- AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi);
- MI->eraseFromParent();
- moveSMRDToVALU(Lo, MRI, Worklist);
- moveSMRDToVALU(Hi, MRI, Worklist);
- break;
- }
- }
-}
-
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
SmallVector<MachineInstr *, 128> Worklist;
Worklist.push_back(&TopInst);
// Handle some special cases
switch (Opcode) {
default:
- if (isSMRD(*Inst)) {
- moveSMRDToVALU(Inst, MRI, Worklist);
- continue;
- }
break;
case AMDGPU::S_AND_B64:
splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size;
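+  // The ELEMENT_SIZE field holds log2(element size in bytes) - 1, so e.g.
+  // a 4-byte maximum private element size encodes as 1.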
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+
+ Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT);
+
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return Rsrc23;
}
+
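+// Coarse latency classification: scalar memory reads (SMRD) are treated as
+// low latency, vector memory accesses (MUBUF/MTBUF/MIMG) as high latency.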
+bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+
+ return isSMRD(Opc);
+}
+
+bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
+ unsigned Opc = MI->getOpcode();
+
+ return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+}
/// \brief Fix operands in \p MI to satisfy constant bus requirements.
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+  /// Copy a value from a VGPR (\p SrcReg) to an SGPR. This function can only
+  /// be used when it is known that the value in \p SrcReg is the same across
+  /// all threads in the wave.
+ /// \returns The SGPR register that \p SrcReg was copied to.
+ unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
+ MachineRegisterInfo &MRI) const;
+
+ void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr *MI) const;
+
/// \brief Legalize all operands in this instruction. This function may
/// create new instruction and insert them before \p MI.
void legalizeOperands(MachineInstr *MI) const;
- /// \brief Split an SMRD instruction into two smaller loads of half the
- // size storing the results in \p Lo and \p Hi.
- void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC,
- unsigned HalfImmOp, unsigned HalfSGPROp,
- MachineInstr *&Lo, MachineInstr *&Hi) const;
-
- void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
-
/// \brief Replace this instruction's opcode with the equivalent VALU
/// opcode. This function will also move the users of \p MI to the
/// VALU if necessary.
uint64_t getDefaultRsrcDataFormat() const;
uint64_t getScratchRsrcWords23() const;
+
+ bool isLowLatencyInstruction(const MachineInstr *MI) const;
+ bool isHighLatencyInstruction(const MachineInstr *MI) const;
};
namespace AMDGPU {
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
+ const uint64_t RSRC_ELEMENT_SIZE_SHIFT = 51;
} // End namespace AMDGPU
namespace SI {
def isCIOnly : Predicate<"Subtarget->getGeneration() =="
"AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate <"FeatureSeaIslands">;
-def isVI : Predicate <
- "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGCN3Encoding">;
def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
SDTCisVT<0, i64>]>
>;
+//===----------------------------------------------------------------------===//
+// PatFrags for FLAT instructions
+//===----------------------------------------------------------------------===//
+
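+// On subtargets with flat instructions, global (and, for loads, constant)
+// address-space accesses may be selected to FLAT, so these fragments match
+// flat, global, and constant accesses.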
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
+ (ld node:$ptr), [{
+ return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
+ isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
+ isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def flat_load : flat_ld <load>;
+def flat_az_extloadi8 : flat_ld <az_extloadi8>;
+def flat_sextloadi8 : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16 : flat_ld <sextloadi16>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
+ isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def flat_store: flat_st <store>;
+def flat_truncstorei8 : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
return isGlobalLoad(cast<LoadSDNode>(N)) ||
isConstantLoad(cast<LoadSDNode>(N), -1);
list<SchedReadWrite> sched,
string revOpName = "", string asm = opName#"_e32 "#op_asm,
string alias_asm = opName#" "#op_asm> {
- def "" : VOPC_Pseudo <ins, pattern, opName> {
+ def "" : VOPC_Pseudo <ins, pattern, opName>,
+ VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = sched;
}
multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32,
list<dag> pat64, bit DefExec, string revOp,
VOPProfile p, list<SchedReadWrite> sched> {
- defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>;
+ defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched,
+ revOp>;
defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64,
opName, p.HasModifiers, DefExec, revOp, sched>;
} // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = ""
}
-class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> :
- FLAT <op, (outs regClass:$vdst),
- (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> {
- let data = 0;
- let mayLoad = 1;
+//===----------------------------------------------------------------------===//
+// FLAT classes
+//===----------------------------------------------------------------------===//
+
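+// Carries the CI and VI encodings of a FLAT opcode; the VI encoding
+// defaults to the CI value when the two match.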
+class flat <bits<7> ci, bits<7> vi = ci> {
+ field bits<7> CI = ci;
+ field bits<7> VI = vi;
}
-class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
- FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr,
- glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
- name#" $data, $addr"#"$glc"#"$slc"#"$tfe",
- []> {
+class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> :
+ FLAT <0, outs, ins, "", pattern>,
+ SIMCInstr<opName, SISubtarget.NONE> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
- let mayLoad = 0;
- let mayStore = 1;
+class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.SI> {
+ let AssemblerPredicate = isCIOnly;
+}
- // Encoding
- let vdst = 0;
+class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> :
+ FLAT <op, outs, ins, asm, []>,
+ SIMCInstr<opName, SISubtarget.VI> {
+ let AssemblerPredicate = VIAssemblerPredicate;
}
-multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc,
- RegisterClass data_rc = vdst_rc> {
+multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm,
+ list<dag> pattern> {
+ def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>,
+ AtomicNoRet <NAME, 1>;
- let mayLoad = 1, mayStore = 1 in {
- def "" : FLAT <op, (outs),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $addr, $data"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 0> {
- let glc = 0;
- let vdst = 0;
- }
+ def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>;
- def _RTN : FLAT <op, (outs vdst_rc:$vdst),
- (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
- tfe_flat_atomic:$tfe),
- name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>,
- AtomicNoRet <NAME, 1> {
- let glc = 1;
- let hasPostISelHook = 1;
- }
+ def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>;
+}
+
+multiclass FLAT_Load_Helper <flat op, string asm_name,
+ RegisterClass regClass,
+ dag outs = (outs regClass:$vdst),
+ dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let data = 0, mayLoad = 1 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_Store_Helper <flat op, string asm_name,
+ RegisterClass vdataClass,
+ dag outs = (outs),
+ dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc,
+ slc_flat:$slc, tfe_flat:$tfe),
+ string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> {
+
+ let mayLoad = 0, mayStore = 1, vdst = 0 in {
+
+ def "" : FLAT_Pseudo <NAME, outs, ins, []>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>;
+ }
+}
+
+multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc,
+ RegisterClass data_rc = vdst_rc,
+ dag outs_noret = (outs),
+ string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> {
+
+ let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in {
+ def "" : FLAT_Pseudo <NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>,
+ AtomicNoRet <NAME, 0>;
+
+ def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+
+ def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret,
+ (ins VReg_64:$addr, data_rc:$data,
+ slc_flat_atomic:$slc, tfe_flat_atomic:$tfe),
+ asm_noret>;
+ }
+
+ let glc = 1, hasPostISelHook = 1 in {
+ defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst),
+ (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc,
+ tfe_flat_atomic:$tfe),
+ asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>;
}
}
// SMRD Instructions
//===----------------------------------------------------------------------===//
-let mayLoad = 1 in {
-
// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
// SMRD instructions, because the SGPR_32 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
-} // mayLoad = 1
-
//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>;
defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32",
- [(set i32:$dst, (ctlz_zero_undef i32:$src0))]
+ [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))]
>;
defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>;
def _SAVE : InstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset),
- "", []
- > {
+ SReg_32:$scratch_offset, i32imm:$offset),
+ "", []> {
let mayStore = 1;
let mayLoad = 0;
}
def _RESTORE : InstSI <
(outs vgpr_class:$dst),
- (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset),
- "", []
- > {
+ (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
+ i32imm:$offset),
+ "", []> {
let mayStore = 0;
let mayLoad = 1;
}
llvm_i32_ty], // tfe(imm)
[IntrReadArgMem]>;
- def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
// Fully-flexible SAMPLE instruction.
class SampleRaw : Intrinsic <
using namespace llvm;
-namespace {
+#define DEBUG_TYPE "si-lower-control-flow"
-class SILowerControlFlowPass : public MachineFunctionPass {
+namespace {
+class SILowerControlFlow : public MachineFunctionPass {
private:
static const unsigned SkipThreshold = 12;
- static char ID;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
void IndirectDst(MachineInstr &MI);
public:
- SILowerControlFlowPass(TargetMachine &tm) :
+ static char ID;
+
+ SILowerControlFlow() :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI Lower control flow instructions";
+ return "SI Lower control flow pseudo instructions";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
} // End anonymous namespace
-char SILowerControlFlowPass::ID = 0;
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+ "SI lower control flow", false, false)
-FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
- return new SILowerControlFlowPass(tm);
+char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+
+FunctionPass *llvm::createSILowerControlFlowPass() {
+ return new SILowerControlFlow();
}
-bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
+bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
+ MachineBasicBlock *To) {
unsigned NumInstr = 0;
return false;
}
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return;
.addOperand(To);
}
-void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
+void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}
-void SILowerControlFlowPass::If(MachineInstr &MI) {
+void SILowerControlFlow::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Break(MachineInstr &MI) {
+void SILowerControlFlow::Break(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Branch(MachineInstr &MI) {
+void SILowerControlFlow::Branch(MachineInstr &MI) {
if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
}
-void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+void SILowerControlFlow::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
MI.eraseFromParent();
}
-void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
// indirect Index. e.g. v0 = v[VecReg + Offset]
// As an output, this is a constant value that needs
// to be added to the value stored in M0.
-void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
- unsigned &Reg,
- int &Offset) {
+void SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
+ unsigned &Reg,
+ int &Offset) {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
Reg = RC->getRegister(RegIdx);
}
-void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
LoadM0(MI, MovRel, Off);
}
-void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
LoadM0(MI, MovRel, Off);
}
-bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI =
static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
case AMDGPU::SI_INDIRECT_DST_V16:
IndirectDst(MI);
break;
+
+ case AMDGPU::S_ENDPGM: {
+ if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid())
+ break;
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // S_ENDPGM is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ }
+
+ I->eraseFromParent();
+ break;
+ }
}
}
}
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
}
- // FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
- // Insert the prologue initializing the SGPRs pointing to the scratch space
- // for flat accesses.
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
// TODO: What to use with function calls?
-
- // FIXME: This is reporting stack size that is used in a scratch buffer
- // rather than registers as well.
- uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
- int IndirectBegin
- = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
- // Convert register index to 256-byte unit.
- uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
- assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
- "Stack limits should be smaller than 16-bits");
-
- // Initialize the flat scratch register pair.
- // TODO: Can we use one s_mov_b64 here?
-
- // Offset is in units of 256-bytes.
- MachineBasicBlock &MBB = MF.front();
- DebugLoc NoDL;
- MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
- const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
- assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
- .addImm(StackOffset);
-
- // Documentation says size is "per-thread scratch size in bytes"
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
- .addImm(StackSizeBytes);
+ // We will need to initialize the flat scratch register pair.
+ MFI->setHasFlatInstructions(true);
}
return true;
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
- LDSWaveSpillSize(0),
PSInputAddr(0),
+ ReturnsVoid(true),
+ LDSWaveSpillSize(0),
+ PSInputEna(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
+ HasNonSpillStackObjects(false),
+ HasFlatInstructions(false),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
const Function *F = MF.getFunction();
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
if (getShaderType() == ShaderType::COMPUTE)
if (F->hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+
bool MaySpill = ST.isVGPRSpillingEnabled(this);
bool HasStackObjects = FrameInfo->hasStackObjects();
DispatchPtr = true;
}
- // X, XY, and XYZ are the only supported combinations, so make sure Y is
- // enabled if Z is.
- if (WorkItemIDZ)
- WorkItemIDY = true;
+ // We don't need to worry about accessing spills with flat instructions.
+ // TODO: On VI where we must use flat for global, we should be able to omit
+ // this if it is never used for generic access.
+ if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
+ ST.isAmdHsaOS())
+ FlatScratchInit = true;
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
return KernargSegmentPtrUserSGPR;
}
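+// The flat scratch init value is 64 bits wide, so reserve two consecutive
+// user SGPRs and return the matching 64-bit super-register.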
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+ FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+ NumUserSGPRs += 2;
+ return FlatScratchInitUserSGPR;
+}
+
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
unsigned FrameIndex,
unsigned SubIdx) {
- const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+ MachineFrameInfo *FrameInfo = MF->getFrameInfo();
const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo());
MachineRegisterInfo &MRI = MF->getRegInfo();
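+ // Each SGPR spill occupies one 32-bit lane of a VGPR, and a wavefront has
+ // 64 lanes, so a single VGPR holds 64 spill slots.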
unsigned Lane = (Offset / 4) % 64;
struct SpilledReg Spill;
+ Spill.Lane = Lane;
if (!LaneVGPRs.count(LaneVGPRIdx)) {
unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+
+ // We have no VGPRs left for spilling SGPRs.
+ if (LaneVGPR == AMDGPU::NoRegister)
+ return Spill;
+
LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
// Add this register as live-in to all blocks to avoid machine verifier
}
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
- Spill.Lane = Lane;
return Spill;
}
unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
+ // Graphics info.
+ unsigned PSInputAddr;
+ bool ReturnsVoid;
+
public:
// FIXME: Make private
unsigned LDSWaveSpillSize;
- unsigned PSInputAddr;
+ unsigned PSInputEna;
std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
+ bool HasNonSpillStackObjects;
+ bool HasFlatInstructions;
// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
unsigned VGPR;
int Lane;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
- SpilledReg() : VGPR(0), Lane(-1) { }
+ SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
bool hasLane() { return Lane != -1;}
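+ // hasReg() is false when no free VGPR could be found for the spill.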
+ bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
// SIMachineFunctionInfo definition
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
HasSpilledVGPRs = Spill;
}
+ bool hasNonSpillStackObjects() const {
+ return HasNonSpillStackObjects;
+ }
+
+ void setHasNonSpillStackObjects(bool StackObject = true) {
+ HasNonSpillStackObjects = StackObject;
+ }
+
+ bool hasFlatInstructions() const {
+ return HasFlatInstructions;
+ }
+
+ void setHasFlatInstructions(bool UseFlat = true) {
+ HasFlatInstructions = UseFlat;
+ }
+
+ unsigned getPSInputAddr() const {
+ return PSInputAddr;
+ }
+
+ bool isPSInputAllocated(unsigned Index) const {
+ return PSInputAddr & (1 << Index);
+ }
+
+ void markPSInputAllocated(unsigned Index) {
+ PSInputAddr |= 1 << Index;
+ }
+
+ bool returnsVoid() const {
+ return ReturnsVoid;
+ }
+
+ void setIfReturnsVoid(bool Value) {
+ ReturnsVoid = Value;
+ }
+
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
};
--- /dev/null
+//===-- SIMachineScheduler.cpp - SI Scheduler Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIMachineScheduler.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+// This scheduler implements a different scheduling algorithm than
+// GenericScheduler.
+//
+// There are several specific architecture behaviours that can't be modelled
+// by GenericScheduler:
+// . When accessing the result of an SGPR load instruction, you have to wait
+// for all the SGPR load instructions issued before your current instruction
+// to finish.
+// . When accessing the result of a VGPR load instruction, you have to wait
+// for all the VGPR load instructions issued before the VGPR load instruction
+// you are interested in to finish.
+// . The lower the register pressure, the better load latencies are hidden.
+//
+// Moreover some specificities (like the fact that a lot of instructions in
+// the shader have few dependencies) make the generic scheduler behave
+// unpredictably. For example when register pressure becomes high, it can
+// either manage to keep register pressure from going too high, or it can
+// increase register pressure even more than if it hadn't taken register
+// pressure into account.
+//
+// Also some other bad behaviours are generated, like loading a constant into
+// a VGPR at the beginning of the shader even though it is not needed until
+// the end of the shader.
+//
+// The scheduling problem for SI can distinguish three main parts:
+// . Hiding high latencies (texture sampling, etc)
+// . Hiding low latencies (SGPR constant loading, etc)
+// . Keeping register usage low for better latency hiding and general
+// performance
+//
+// Some other things can also affect performance, but are hard to predict
+// (cache usage, the fact the HW can issue several instructions from different
+// wavefronts if they are of different types, etc.)
+//
+// This scheduler tries to solve the scheduling problem by dividing it into
+// simpler sub-problems. It divides the instructions into blocks, schedules
+// locally inside the blocks where it takes care of low latencies, and then
+// chooses the order of the blocks by taking care of high latencies.
+// Dividing the instructions into blocks helps keep register
+// usage low.
+//
+// First the instructions are put into blocks.
+// We want the blocks to help control register usage and hide high latencies
+// later. To help control register usage, we typically want all local
+// computations, when for example you create a result that can be consumed
+// right away, to be contained in a block. Block inputs and outputs would
+// typically be important results that are needed in several locations of
+// the shader. Since we do want blocks to help hide high latencies, we want
+// the instructions inside the block to have a minimal set of dependencies
+// on high latencies. This makes it easy to pick blocks to hide specific
+// high latencies.
+// The block creation algorithm is divided into several steps, and several
+// variants can be tried during the scheduling process.
+//
+// Second the order of the instructions inside the blocks is chosen.
+// At that step we take into account only register usage and hiding
+// low latency instructions.
+//
+// Third the block order is chosen; there we try to hide high latencies
+// and keep register usage low.
+//
+// After the third step, a pass is done to improve the hiding of low
+// latencies.
+//
+// When talking about 'low latency' or 'high latency', we include both the
+// latency for the cache (or global mem) data to reach the register, and the
+// bandwidth limitations.
+// Increasing the number of active wavefronts helps hide the former, but it
+// doesn't solve the latter, which is why even if the wavefront count is high,
+// we have to try to have as many instructions hiding high latencies as
+// possible.
+// The OpenCL doc gives for example a latency of 400 cycles for a global mem
+// access, which is hidden by 10 instructions if the wavefront count is 10.
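+// (A rough sanity check of that figure: with 10 resident wavefronts, about
+// 10 * 4 = 40 cycles elapse between two instructions of the same wavefront,
+// so 10 instructions cover roughly 10 * 40 = 400 cycles.)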
+
+// Some figures taken from AMD docs:
+// Both texture and constant L1 caches are 4-way associative with 64-byte
+// lines.
+// Constant cache is shared with 4 CUs.
+// For texture sampling, the address generation unit receives 4 texture
+// addresses per cycle, thus we could expect texture sampling latency to be
+// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
+// instructions in a wavefront group are executed every 4 cycles),
+// or 16 instructions if the other wavefronts associated to the 3 other VALUs
+// of the CU do texture sampling too. (Don't take these figures too seriously,
+// as I'm not 100% sure of the computation)
+// Data exports should get similar latency.
+// For constant loading, the cache is shared among 4 CUs.
+// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
+// I guess if the other CUs don't read the cache, it can go up to 64B/cycle.
+// It means a simple s_buffer_load should take one instruction to hide, as
+// well as an s_buffer_loadx2 and potentially an s_buffer_loadx8 if on the
+// same cache line.
+//
+// As of today the driver doesn't preload the constants in cache, thus the
+// first loads get extra latency. The doc says global memory access can be
+// 300-600 cycles. We do not specially take that into account when scheduling,
+// as we expect the driver to be able to preload the constants soon.
+
+
+// common code //
+
+#ifndef NDEBUG
+
+static const char *getReasonStr(SIScheduleCandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND";
+ case RegUsage: return "REGUSAGE";
+ case Latency: return "LATENCY";
+ case Successor: return "SUCCESSOR";
+ case Depth: return "DEPTH";
+ case NodeOrder: return "ORDER";
+ }
+ llvm_unreachable("Unknown reason!");
+}
+
+#endif
+
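+// Helpers to compare two scheduling candidates on one criterion.
+// They return true when the criterion decides the comparison (TryCand wins
+// if TryCand.Reason was set, otherwise Cand stays the best candidate), and
+// false on a tie, so the caller falls through to the next criterion.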
+static bool tryLess(int TryVal, int CandVal,
+ SISchedulerCandidate &TryCand,
+ SISchedulerCandidate &Cand,
+ SIScheduleCandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ Cand.setRepeat(Reason);
+ return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+ SISchedulerCandidate &TryCand,
+ SISchedulerCandidate &Cand,
+ SIScheduleCandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ Cand.setRepeat(Reason);
+ return false;
+}
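+// A typical comparison chain in a tryCandidate* function below looks like:
+//   if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+//     return;
+// The first criterion on which the candidates differ decides; later criteria
+// only break ties.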
+
+// SIScheduleBlock //
+
+void SIScheduleBlock::addUnit(SUnit *SU) {
+ NodeNum2Index[SU->NodeNum] = SUnits.size();
+ SUnits.push_back(SU);
+}
+
+#ifndef NDEBUG
+
+void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
+ dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+ dbgs() << '\n';
+}
+#endif
+
+void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
+ SISchedCandidate &TryCand) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ if (Cand.SGPRUsage > 60 &&
+ tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Schedule low latency instructions as close to the top as possible.
+ // Order of priority is:
+ // . Low latency instructions which do not depend on other low latency
+ // instructions we haven't waited for
+ // . Other instructions which do not depend on low latency instructions
+ // we haven't waited for
+ // . Low latencies
+ // . All other instructions
+ // Goal is to get: low latency instructions - independent instructions
+ // - (eventually some more low latency instructions)
+ // - instructions that depend on the first low latency instructions.
+ // If the block contains a lot of constant loads, the SGPR usage could go
+ // quite high; the arbitrary limit of 60 above encourages using the already
+ // loaded constants (in order to release some SGPRs) before loading more.
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
+ Cand.HasLowLatencyNonWaitedParent,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (TryCand.IsLowLatency &&
+ tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
+ TryCand, Cand, SIScheduleCandReason::Depth))
+ return;
+
+ if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
+ return;
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ }
+}
+
+SUnit* SIScheduleBlock::pickNode() {
+ SISchedCandidate TopCand;
+
+ for (SUnit* SU : TopReadySUs) {
+ SISchedCandidate TryCand;
+ std::vector<unsigned> pressure;
+ std::vector<unsigned> MaxPressure;
+ // Predict register usage after this instruction.
+ TryCand.SU = SU;
+ TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
+ TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
+ TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+ TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
+ TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
+ TryCand.HasLowLatencyNonWaitedParent =
+ HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
+ tryCandidateTopDown(TopCand, TryCand);
+ if (TryCand.Reason != NoCand)
+ TopCand.setBest(TryCand);
+ }
+
+ return TopCand.SU;
+}
+
+
+// Schedule something valid.
+void SIScheduleBlock::fastSchedule() {
+ TopReadySUs.clear();
+ if (Scheduled)
+ undoSchedule();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = TopReadySUs[0];
+ ScheduledSUnits.push_back(SU);
+ nodeScheduled(SU);
+ }
+
+ Scheduled = true;
+}
+
+// Returns true if the register was defined between First and Last.
+static bool isDefBetween(unsigned Reg,
+ SlotIndex First, SlotIndex Last,
+ const MachineRegisterInfo *MRI,
+ const LiveIntervals *LIS) {
+ for (MachineRegisterInfo::def_instr_iterator
+ UI = MRI->def_instr_begin(Reg),
+ UE = MRI->def_instr_end(); UI != UE; ++UI) {
+ const MachineInstr* MI = &*UI;
+ if (MI->isDebugValue())
+ continue;
+ SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+ if (InstSlot >= First && InstSlot <= Last)
+ return true;
+ }
+ return false;
+}
+
+void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ IntervalPressure Pressure, BotPressure;
+ RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineRegisterInfo *MRI = DAG->getMRI();
+ DAG->initRPTracker(TopRPTracker);
+ DAG->initRPTracker(BotRPTracker);
+ DAG->initRPTracker(RPTracker);
+
+ // Goes through all SUs. RPTracker captures what had to be alive for the SUs
+ // to execute, and what is still alive at the end.
+ for (SUnit* SU : ScheduledSUnits) {
+ RPTracker.setPos(SU->getInstr());
+ RPTracker.advance();
+ }
+
+ // Close the RPTracker to finalize live ins/outs.
+ RPTracker.closeRegion();
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Do not track physical registers, because tracking them messes things up.
+ for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ LiveInRegs.insert(Reg);
+ }
+ LiveOutRegs.clear();
+ // There are several possibilities to distinguish:
+ // 1) Reg is not input to any instruction in the block, but is output of one
+ // 2) 1) + read in the block and not needed after it
+ // 3) 1) + read in the block but needed in another block
+ // 4) Reg is input of an instruction but another block will read it too
+ // 5) Reg is input of an instruction and then rewritten in the block.
+ // result is not read in the block (implies used in another block)
+ // 6) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block and not needed in another block
+ // 7) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block but also needed in another block
+ // LiveInRegs will contain all the regs in situations 4, 5, 6, 7.
+ // We want LiveOutRegs to contain only Regs whose content will be read after
+ // in another block, and whose content was written in the current block,
+ // that is we want it to get 1, 3, 5, 7.
+ // Since we made the MIs of a block be packed all together before
+ // scheduling, the LiveIntervals were correct, and the RPTracker was
+ // able to correctly handle 5 vs 6, 2 vs 3.
+ // (Note: This is not sufficient for the RPTracker to avoid mistakes for
+ // case 4.)
+ // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5, 7.
+ // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7.
+ // The use of isDefBetween removes case 4.
+ for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(EndBlock).getRegSlot(),
+ MRI, LIS)) {
+ LiveOutRegs.insert(Reg);
+ }
+ }
+
+ // Pressure = sum over all alive registers of the register size.
+ // Internally LLVM will represent some registers as one big 128-bit register
+ // for example, but it actually corresponds to 4 actual 32-bit registers.
+ // Thus Pressure is not equal to num_alive_registers * constant.
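+ // E.g. a single live VReg_128 virtual register contributes 4 to the 32-bit
+ // VGPR pressure set.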
+ LiveInPressure = TopPressure.MaxSetPressure;
+ LiveOutPressure = BotPressure.MaxSetPressure;
+
+ // Prepares TopRPTracker for top down scheduling.
+ TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ if (!Scheduled)
+ fastSchedule();
+
+ // PreScheduling phase to set LiveIn and LiveOut.
+ initRegPressure(BeginBlock, EndBlock);
+ undoSchedule();
+
+ // Schedule for real now.
+
+ TopReadySUs.clear();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = pickNode();
+ ScheduledSUnits.push_back(SU);
+ TopRPTracker.setPos(SU->getInstr());
+ TopRPTracker.advance();
+ nodeScheduled(SU);
+ }
+
+ // TODO: compute InternalAdditionnalPressure.
+ InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+ // Check everything is right.
+#ifndef NDEBUG
+ assert(SUnits.size() == ScheduledSUnits.size() &&
+ TopReadySUs.empty());
+ for (SUnit* SU : SUnits) {
+ assert(SU->isScheduled &&
+ SU->NumPredsLeft == 0);
+ }
+#endif
+
+ Scheduled = true;
+}
+
+void SIScheduleBlock::undoSchedule() {
+ for (SUnit* SU : SUnits) {
+ SU->isScheduled = false;
+ for (SDep& Succ : SU->Succs) {
+ if (BC->isSUInBlock(Succ.getSUnit(), ID))
+ undoReleaseSucc(SU, &Succ);
+ }
+ }
+ HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+ ScheduledSUnits.clear();
+ Scheduled = false;
+}
+
+void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ ++SuccSU->WeakPredsLeft;
+ return;
+ }
+ ++SuccSU->NumPredsLeft;
+}
+
+void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(DAG);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+
+ --SuccSU->NumPredsLeft;
+}
+
+/// Release the successors of SU that are inside the block when InOrOutBlock
+/// is true, or those outside the block when it is false.
+void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
+ for (SDep& Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+
+ if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
+ continue;
+
+ releaseSucc(SU, &Succ);
+ if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
+ TopReadySUs.push_back(SuccSU);
+ }
+}
+
+void SIScheduleBlock::nodeScheduled(SUnit *SU) {
+ // SU must be in TopReadySUs.
+ assert(!SU->NumPredsLeft);
+ std::vector<SUnit*>::iterator I =
+ std::find(TopReadySUs.begin(), TopReadySUs.end(), SU);
+ if (I == TopReadySUs.end()) {
+ dbgs() << "Data Structure Bug in SI Scheduler\n";
+ llvm_unreachable(nullptr);
+ }
+ TopReadySUs.erase(I);
+
+ releaseSuccessors(SU, true);
+ // Scheduling this node will trigger the wait for its low latency parents,
+ // thus the other instructions no longer need to wait for them either.
+ if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
+ HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
+
+ if (DAG->IsLowLatencySU[SU->NodeNum]) {
+ for (SDep& Succ : SU->Succs) {
+ std::map<unsigned, unsigned>::iterator I =
+ NodeNum2Index.find(Succ.getSUnit()->NodeNum);
+ if (I != NodeNum2Index.end())
+ HasLowLatencyNonWaitedParent[I->second] = 1;
+ }
+ }
+ SU->isScheduled = true;
+}
+
+void SIScheduleBlock::finalizeUnits() {
+ // We remove links from outside blocks to enable scheduling inside the block.
+ for (SUnit* SU : SUnits) {
+ releaseSuccessors(SU, false);
+ if (DAG->IsHighLatencySU[SU->NodeNum])
+ HighLatencyBlock = true;
+ }
+ HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
+}
+
+// We maintain ascending order of IDs.
+void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
+ unsigned PredID = Pred->getID();
+
+ // Check if not already predecessor.
+ for (SIScheduleBlock* P : Preds) {
+ if (PredID == P->getID())
+ return;
+ }
+ Preds.push_back(Pred);
+
+#ifndef NDEBUG
+ for (SIScheduleBlock* S : Succs) {
+ if (PredID == S->getID())
+ assert(!"Loop in the Block Graph!\n");
+ }
+#endif
+}
+
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+ unsigned SuccID = Succ->getID();
+
+ // Check if not already a successor.
+ for (SIScheduleBlock* S : Succs) {
+ if (SuccID == S->getID())
+ return;
+ }
+ if (Succ->isHighLatencyBlock())
+ ++NumHighLatencySuccessors;
+ Succs.push_back(Succ);
+#ifndef NDEBUG
+ for (SIScheduleBlock* P : Preds) {
+ if (SuccID == P->getID())
+ assert("Loop in the Block Graph!\n");
+ }
+#endif
+}
+
+#ifndef NDEBUG
+void SIScheduleBlock::printDebug(bool full) {
+ dbgs() << "Block (" << ID << ")\n";
+ if (!full)
+ return;
+
+ dbgs() << "\nContains High Latency Instruction: "
+ << HighLatencyBlock << '\n';
+ dbgs() << "\nDepends On:\n";
+ for (SIScheduleBlock* P : Preds) {
+ P->printDebug(false);
+ }
+
+ dbgs() << "\nSuccessors:\n";
+ for (SIScheduleBlock* S : Succs) {
+ S->printDebug(false);
+ }
+
+ if (Scheduled) {
+ dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
+ << LiveInPressure[DAG->getVGPRSetID()] << '\n';
+ dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
+ << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+ dbgs() << "LiveIns:\n";
+ for (unsigned Reg : LiveInRegs)
+ dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+
+ dbgs() << "\nLiveOuts:\n";
+ for (unsigned Reg : LiveOutRegs)
+ dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ }
+
+ dbgs() << "\nInstructions:\n";
+ if (!Scheduled) {
+ for (SUnit* SU : SUnits) {
+ SU->dump(DAG);
+ }
+ } else {
+ for (SUnit* SU : SUnits) {
+ SU->dump(DAG);
+ }
+ }
+
+ dbgs() << "///////////////////////\n";
+}
+
+#endif
+
+// SIScheduleBlockCreator //
+
+SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
+DAG(DAG) {
+}
+
+SIScheduleBlockCreator::~SIScheduleBlockCreator() {
+}
+
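+// Block creation, intra-block scheduling and the associated statistics are
+// expensive, so getBlocks() computes them once per variant and caches the
+// result in the Blocks map.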
+SIScheduleBlocks
+SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
+ std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
+ Blocks.find(BlockVariant);
+ if (B == Blocks.end()) {
+ SIScheduleBlocks Res;
+ createBlocksForVariant(BlockVariant);
+ topologicalSort();
+ scheduleInsideBlocks();
+ fillStats();
+ Res.Blocks = CurrentBlocks;
+ Res.TopDownIndex2Block = TopDownIndex2Block;
+ Res.TopDownBlock2Index = TopDownBlock2Index;
+ Blocks[BlockVariant] = Res;
+ return Res;
+ } else {
+ return B->second;
+ }
+}
+
+bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
+ if (SU->NodeNum >= DAG->SUnits.size())
+ return false;
+ return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesAlone() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ if (DAG->IsHighLatencySU[SU->NodeNum]) {
+ CurrentColoring[SU->NodeNum] = NextReservedID++;
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorHighLatenciesGroups() {
+ unsigned DAGSize = DAG->SUnits.size();
+ unsigned NumHighLatencies = 0;
+ unsigned GroupSize;
+ unsigned Color = NextReservedID;
+ unsigned Count = 0;
+ std::set<unsigned> FormingGroup;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ if (DAG->IsHighLatencySU[SU->NodeNum])
+ ++NumHighLatencies;
+ }
+
+ if (NumHighLatencies == 0)
+ return;
+
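+ // Heuristic: the more high latency instructions there are, the larger the
+ // groups they get clustered into.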
+ if (NumHighLatencies <= 6)
+ GroupSize = 2;
+ else if (NumHighLatencies <= 12)
+ GroupSize = 3;
+ else
+ GroupSize = 4;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ if (DAG->IsHighLatencySU[SU->NodeNum]) {
+ bool CompatibleGroup = true;
+ unsigned ProposedColor = Color;
+ for (unsigned j : FormingGroup) {
+ // TODO: Currently CompatibleGroup will always be false,
+ // because the graph enforces the load order. This
+ // can be fixed, but as keeping the load order is often
+ // good for performance that causes a performance hit (both
+ // the default scheduler and this scheduler).
+ // When this scheduler determines a good load order,
+ // this can be fixed.
+ if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
+ !DAG->canAddEdge(&DAG->SUnits[j], SU))
+ CompatibleGroup = false;
+ }
+ if (!CompatibleGroup || ++Count == GroupSize) {
+ FormingGroup.clear();
+ Color = ++NextReservedID;
+ if (!CompatibleGroup) {
+ ProposedColor = Color;
+ FormingGroup.insert(SU->NodeNum);
+ }
+ Count = 0;
+ } else {
+ FormingGroup.insert(SU->NodeNum);
+ }
+ CurrentColoring[SU->NodeNum] = ProposedColor;
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorComputeReservedDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<std::set<unsigned>, unsigned> ColorCombinations;
+
+ CurrentTopDownReservedDependencyColoring.clear();
+ CurrentBottomUpReservedDependencyColoring.clear();
+
+ CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
+ CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
+
+ // Traverse TopDown, and give different colors to SUs depending
+ // on which combination of High Latencies they depend on.
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
+ SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
+ }
+ // Color 0 by default.
+ if (SUColors.empty())
+ continue;
+ // Same color as parents.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+
+ ColorCombinations.clear();
+
+ // Same as before, but BottomUp.
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ std::set<unsigned> SUColors;
+
+ // Already given.
+ if (CurrentColoring[SU->NodeNum]) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ CurrentColoring[SU->NodeNum];
+ continue;
+ }
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
+ SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
+ }
+ // Keep color 0.
+ if (SUColors.empty())
+ continue;
+ // Same color as parents.
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ *SUColors.begin();
+ else {
+ std::map<std::set<unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
+ NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
+
+ // Every combination of colors given by the top down and bottom up
+ // reserved node dependency colorings gets its own color.
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ std::pair<unsigned, unsigned> SUColors;
+
+ // High latency instructions: already given.
+ if (CurrentColoring[SU->NodeNum])
+ continue;
+
+ SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
+ SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
+
+ std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
+ ColorCombinations.find(SUColors);
+ if (Pos != ColorCombinations.end()) {
+ CurrentColoring[SU->NodeNum] = Pos->second;
+ } else {
+ CurrentColoring[SU->NodeNum] = NextNonReservedID;
+ ColorCombinations[SUColors] = NextNonReservedID++;
+ }
+ }
+}
+
+void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::vector<int> PendingColoring = CurrentColoring;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ std::set<unsigned> SUColors;
+ std::set<unsigned> SUColorsPending;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
+ CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
+ CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1 && SUColorsPending.size() == 1)
+ PendingColoring[SU->NodeNum] = *SUColors.begin();
+ else // TODO: Attribute new colors depending on color
+ // combination of children.
+ PendingColoring[SU->NodeNum] = NextNonReservedID++;
+ }
+ CurrentColoring = PendingColoring;
+}
+
+
+void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+ unsigned PreviousColor;
+ std::set<unsigned> SeenColors;
+
+ if (DAGSize <= 1)
+ return;
+
+ PreviousColor = CurrentColoring[0];
+
+ for (unsigned i = 1, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ unsigned CurrentColor = CurrentColoring[i];
+ unsigned PreviousColorSave = PreviousColor;
+ assert(i == SU->NodeNum);
+
+ if (CurrentColor != PreviousColor)
+ SeenColors.insert(PreviousColor);
+ PreviousColor = CurrentColor;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (SeenColors.find(CurrentColor) == SeenColors.end())
+ continue;
+
+ if (PreviousColorSave != CurrentColor)
+ CurrentColoring[i] = NextNonReservedID++;
+ else
+ CurrentColoring[i] = CurrentColoring[i-1];
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ // No predecessor: VGPR constant loading.
+ // Low latency instructions usually have a predecessor (the address).
+ if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
+ unsigned DAGSize = DAG->SUnits.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ }
+}
+
+void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<unsigned, unsigned> ColorCount;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ unsigned color = CurrentColoring[SU->NodeNum];
+ std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
+ if (Pos != ColorCount.end()) {
+ ++ColorCount[color];
+ } else {
+ ColorCount[color] = 1;
+ }
+ }
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ unsigned color = CurrentColoring[SU->NodeNum];
+ std::set<unsigned> SUColors;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ if (ColorCount[color] > 1)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ SUColors.insert(CurrentColoring[Succ->NodeNum]);
+ }
+ if (SUColors.size() == 1 && *SUColors.begin() != color) {
+ --ColorCount[color];
+ CurrentColoring[SU->NodeNum] = *SUColors.begin();
+ ++ColorCount[*SUColors.begin()];
+ }
+ }
+}
+
+void SIScheduleBlockCreator::cutHugeBlocks() {
+ // TODO
+}
+
+void SIScheduleBlockCreator::regroupNoUserInstructions() {
+ unsigned DAGSize = DAG->SUnits.size();
+ int GroupID = NextNonReservedID++;
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]];
+ bool hasSuccessor = false;
+
+ if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
+ continue;
+
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ hasSuccessor = true;
+ }
+ if (!hasSuccessor)
+ CurrentColoring[SU->NodeNum] = GroupID;
+ }
+}
+
+void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
+ unsigned DAGSize = DAG->SUnits.size();
+ std::map<unsigned,unsigned> RealID;
+
+ CurrentBlocks.clear();
+ CurrentColoring.clear();
+ CurrentColoring.resize(DAGSize, 0);
+ Node2CurrentBlock.clear();
+
+ // Restore the links a previous scheduling variant may have overridden.
+ DAG->restoreSULinksLeft();
+
+ NextReservedID = 1;
+ NextNonReservedID = DAGSize + 1;
+
+ DEBUG(dbgs() << "Coloring the graph\n");
+
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
+ colorHighLatenciesGroups();
+ else
+ colorHighLatenciesAlone();
+ colorComputeReservedDependencies();
+ colorAccordingToReservedDependencies();
+ colorEndsAccordingToDependencies();
+ if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
+ colorForceConsecutiveOrderInGroup();
+ regroupNoUserInstructions();
+ colorMergeConstantLoadsNextGroup();
+ colorMergeIfPossibleNextGroupOnlyForReserved();
+
+ // Put SUs of the same color into the same block.
+ Node2CurrentBlock.resize(DAGSize, -1);
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ unsigned Color = CurrentColoring[SU->NodeNum];
+ if (RealID.find(Color) == RealID.end()) {
+ int ID = CurrentBlocks.size();
+ BlockPtrs.push_back(
+ make_unique<SIScheduleBlock>(DAG, this, ID));
+ CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
+ RealID[Color] = ID;
+ }
+ CurrentBlocks[RealID[Color]]->addUnit(SU);
+ Node2CurrentBlock[SU->NodeNum] = RealID[Color];
+ }
+
+ // Build dependencies between blocks.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &DAG->SUnits[i];
+ int SUID = Node2CurrentBlock[i];
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Succ->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+ }
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
+ continue;
+ if (Node2CurrentBlock[Pred->NodeNum] != SUID)
+ CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
+ }
+ }
+
+ // Free the roots and leaves of all blocks to enable scheduling inside them.
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->finalizeUnits();
+ }
+ DEBUG(
+ dbgs() << "Blocks created:\n\n";
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+// Two functions taken from Codegen/MachineScheduler.cpp
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator End) {
+ for(; I != End; ++I) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+ // Cast the return value to nonconst MachineInstr, then cast to an
+ // instr_iterator, which does not check for null, finally return a
+ // bundle_iterator.
+ return MachineBasicBlock::instr_iterator(
+ const_cast<MachineInstr*>(
+ &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+void SIScheduleBlockCreator::topologicalSort() {
+ unsigned DAGSize = CurrentBlocks.size();
+ std::vector<int> WorkList;
+
+ DEBUG(dbgs() << "Topological Sort\n");
+
+ WorkList.reserve(DAGSize);
+ TopDownIndex2Block.resize(DAGSize);
+ TopDownBlock2Index.resize(DAGSize);
+ BottomUpIndex2Block.resize(DAGSize);
+
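+ // TopDownBlock2Index temporarily holds the number of unprocessed successors
+ // of each block; once a block is popped from the worklist, it is overwritten
+ // with the block's final topological index.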
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ unsigned Degree = Block->getSuccs().size();
+ TopDownBlock2Index[i] = Degree;
+ if (Degree == 0) {
+ WorkList.push_back(i);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ int i = WorkList.back();
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ WorkList.pop_back();
+ TopDownBlock2Index[i] = --Id;
+ TopDownIndex2Block[Id] = i;
+ for (SIScheduleBlock* Pred : Block->getPreds()) {
+ if (!--TopDownBlock2Index[Pred->getID()])
+ WorkList.push_back(Pred->getID());
+ }
+ }
+
+#ifndef NDEBUG
+ // Check correctness of the ordering.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ for (SIScheduleBlock* Pred : Block->getPreds()) {
+ assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
+ "Wrong Top Down topological sorting");
+ }
+ }
+#endif
+
+ BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
+ TopDownIndex2Block.rend());
+}
+
+void SIScheduleBlockCreator::scheduleInsideBlocks() {
+ unsigned DAGSize = CurrentBlocks.size();
+
+ DEBUG(dbgs() << "\nScheduling Blocks\n\n");
+
+ // We first compute a valid schedule in which each Block corresponds
+ // to a contiguous range of instructions.
+ DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->fastSchedule();
+ }
+
+ // Note: the following code, and the part restoring the previous position,
+ // is by far the most expensive operation of the Scheduler.
+
+ // Do not update CurrentTop.
+ MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
+ std::vector<MachineBasicBlock::iterator> PosOld;
+ std::vector<MachineBasicBlock::iterator> PosNew;
+ PosOld.reserve(DAG->SUnits.size());
+ PosNew.reserve(DAG->SUnits.size());
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndex = TopDownIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+ for (SUnit* SU : SUs) {
+ MachineInstr *MI = SU->getInstr();
+ MachineBasicBlock::iterator Pos = MI;
+ PosOld.push_back(Pos);
+ if (&*CurrentTopFastSched == MI) {
+ PosNew.push_back(Pos);
+ CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
+ DAG->getCurrentBottom());
+ } else {
+ // Update the instruction stream.
+ DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
+
+ // Update LiveIntervals.
+ // Note: Moving all instructions and calling handleMove every time
+ // is the most CPU-intensive operation of the scheduler.
+ // It would gain a lot if there were a way to recompute the
+ // LiveIntervals for the entire scheduling region.
+ DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true);
+ PosNew.push_back(CurrentTopFastSched);
+ }
+ }
+ }
+
+ // Now we have Block of SUs == Block of MI.
+ // We do the final schedule for the instructions inside the block.
+ // The property that all the SUs of the Block are grouped together as MI
+ // is used for correct reg usage tracking.
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+ Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
+ }
+
+ DEBUG(dbgs() << "Restoring MI Pos\n");
+ // Restore old ordering (which prevents a LIS->handleMove bug).
+ for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
+ MachineBasicBlock::iterator POld = PosOld[i-1];
+ MachineBasicBlock::iterator PNew = PosNew[i-1];
+ if (PNew != POld) {
+ // Update the instruction stream.
+ DAG->getBB()->splice(POld, DAG->getBB(), PNew);
+
+ // Update LiveIntervals.
+ DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true);
+ }
+ }
+
+ DEBUG(
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = CurrentBlocks[i];
+ Block->printDebug(true);
+ }
+ );
+}
+
+void SIScheduleBlockCreator::fillStats() {
+ unsigned DAGSize = CurrentBlocks.size();
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndex = TopDownIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
+ if (Block->getPreds().size() == 0)
+ Block->Depth = 0;
+ else {
+ unsigned Depth = 0;
+ for (SIScheduleBlock *Pred : Block->getPreds()) {
+ if (Depth < Pred->Depth + 1)
+ Depth = Pred->Depth + 1;
+ }
+ Block->Depth = Depth;
+ }
+ }
+
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ int BlockIndex = BottomUpIndex2Block[i];
+ SIScheduleBlock *Block = CurrentBlocks[BlockIndex];
+ if (Block->getSuccs().size() == 0)
+ Block->Height = 0;
+ else {
+ unsigned Height = 0;
+ for (SIScheduleBlock *Succ : Block->getSuccs()) {
+ if (Height < Succ->Height + 1)
+ Height = Succ->Height + 1;
+ }
+ Block->Height = Height;
+ }
+ }
+}
+
+// SIScheduleBlockScheduler //
+
+SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+ SISchedulerBlockSchedulerVariant Variant,
+ SIScheduleBlocks BlocksStruct) :
+ DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
+ LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
+ SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
+
+ // Fill the usage of every output.
+ // Warning: while by construction we always have a link between two blocks
+ // when one needs a result from the other, the number of users of an output
+ // is not simply the number of child blocks having the same virtual register
+ // as input.
+ // Here is an example. A produces x and y. B eats x and produces x'.
+ // C eats x' and y. The register coalescer may have attributed the same
+ // virtual register to x and x'.
+ // To count accurately, we do a topological sort. In case the register is
+ // found for several parents, we increment the usage of the one with the
+ // highest topological index.
+ LiveOutRegsNumUsages.resize(Blocks.size());
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ for (unsigned Reg : Block->getInRegs()) {
+ bool Found = false;
+ int topoInd = -1;
+ for (SIScheduleBlock* Pred: Block->getPreds()) {
+ std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+ std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+ if (RegPos != PredOutRegs.end()) {
+ Found = true;
+ if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
+ topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
+ }
+ }
+ }
+
+ if (!Found)
+ continue;
+
+ int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
+ std::map<unsigned, unsigned>::iterator RegPos =
+ LiveOutRegsNumUsages[PredID].find(Reg);
+ if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
+ ++LiveOutRegsNumUsages[PredID][Reg];
+ } else {
+ LiveOutRegsNumUsages[PredID][Reg] = 1;
+ }
+ }
+ }
+
+ LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
+ BlockNumPredsLeft.resize(Blocks.size());
+ BlockNumSuccsLeft.resize(Blocks.size());
+
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ BlockNumPredsLeft[i] = Block->getPreds().size();
+ BlockNumSuccsLeft[i] = Block->getSuccs().size();
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ assert(Block->getID() == i);
+ }
+#endif
+
+ std::set<unsigned> InRegs = DAG->getInRegs();
+ addLiveRegs(InRegs);
+
+ // Fill LiveRegsConsumers for regs that were already
+ // defined before scheduling.
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ for (unsigned Reg : Block->getInRegs()) {
+ bool Found = false;
+ for (SIScheduleBlock* Pred: Block->getPreds()) {
+ std::set<unsigned> PredOutRegs = Pred->getOutRegs();
+ std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
+
+ if (RegPos != PredOutRegs.end()) {
+ Found = true;
+ break;
+ }
+ }
+
+ if (!Found) {
+ if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
+ LiveRegsConsumers[Reg] = 1;
+ else
+ ++LiveRegsConsumers[Reg];
+ }
+ }
+ }
+
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ SIScheduleBlock *Block = Blocks[i];
+ if (BlockNumPredsLeft[i] == 0) {
+ ReadyBlocks.push_back(Block);
+ }
+ }
+
+ while (SIScheduleBlock *Block = pickBlock()) {
+ BlocksScheduled.push_back(Block);
+ blockScheduled(Block);
+ }
+
+ DEBUG(
+ dbgs() << "Block Order:";
+ for (SIScheduleBlock* Block : BlocksScheduled) {
+ dbgs() << ' ' << Block->getID();
+ }
+ );
+}
+
+bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand) {
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Try to hide high latencies.
+ if (tryLess(TryCand.LastPosHighLatParentScheduled,
+ Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
+ return true;
+ // Schedule high latencies early so you can hide them better.
+ if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
+ TryCand, Cand, Latency))
+ return true;
+ if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
+ TryCand, Cand, Depth))
+ return true;
+ if (tryGreater(TryCand.NumHighLatencySuccessors,
+ Cand.NumHighLatencySuccessors,
+ TryCand, Cand, Successor))
+ return true;
+ return false;
+}
+
+bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand) {
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
+ TryCand, Cand, RegUsage))
+ return true;
+ if (tryGreater(TryCand.NumSuccessors > 0,
+ Cand.NumSuccessors > 0,
+ TryCand, Cand, Successor))
+ return true;
+ if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
+ return true;
+ if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
+ TryCand, Cand, RegUsage))
+ return true;
+ return false;
+}
+
+SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
+ SIBlockSchedCandidate Cand;
+ std::vector<SIScheduleBlock*>::iterator Best;
+ SIScheduleBlock *Block;
+ if (ReadyBlocks.empty())
+ return nullptr;
+
+ DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
+ VregCurrentUsage, SregCurrentUsage);
+ if (VregCurrentUsage > maxVregUsage)
+ maxVregUsage = VregCurrentUsage;
+ if (SregCurrentUsage > maxSregUsage)
+ maxSregUsage = SregCurrentUsage;
+ DEBUG(
+ dbgs() << "Picking New Blocks\n";
+ dbgs() << "Available: ";
+ for (SIScheduleBlock* Block : ReadyBlocks)
+ dbgs() << Block->getID() << ' ';
+ dbgs() << "\nCurrent Live:\n";
+ for (unsigned Reg : LiveRegs)
+ dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
+ dbgs() << '\n';
+ dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
+ dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
+ );
+
+ Cand.Block = nullptr;
+ for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
+ E = ReadyBlocks.end(); I != E; ++I) {
+ SIBlockSchedCandidate TryCand;
+ TryCand.Block = *I;
+ TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
+ TryCand.VGPRUsageDiff =
+ checkRegUsageImpact(TryCand.Block->getInRegs(),
+ TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+ TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
+ TryCand.NumHighLatencySuccessors =
+ TryCand.Block->getNumHighLatencySuccessors();
+ TryCand.LastPosHighLatParentScheduled =
+ (unsigned int) std::max<int> (0,
+ LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
+ LastPosWaitedHighLatency);
+ TryCand.Height = TryCand.Block->Height;
+ // Try not to increase VGPR usage too much, else we may spill.
+ if (VregCurrentUsage > 120 ||
+ Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
+ if (!tryCandidateRegUsage(Cand, TryCand) &&
+ Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
+ tryCandidateLatency(Cand, TryCand);
+ } else {
+ if (!tryCandidateLatency(Cand, TryCand))
+ tryCandidateRegUsage(Cand, TryCand);
+ }
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
+ Best = I;
+ DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
+ << getReasonStr(Cand.Reason) << '\n');
+ }
+ }
+
+ DEBUG(
+ dbgs() << "Picking: " << Cand.Block->getID() << '\n';
+ dbgs() << "Is a block with high latency instruction: "
+ << (Cand.IsHighLatency ? "yes\n" : "no\n");
+ dbgs() << "Position of last high latency dependency: "
+ << Cand.LastPosHighLatParentScheduled << '\n';
+ dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
+ dbgs() << '\n';
+ );
+
+ Block = Cand.Block;
+ ReadyBlocks.erase(Best);
+ return Block;
+}
+
+// Tracking of currently alive registers to determine VGPR Usage.
+
+void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
+ for (unsigned Reg : Regs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ // If not already in the live set, then add it.
+ (void) LiveRegs.insert(Reg);
+ }
+}
+
+void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
+ std::set<unsigned> &Regs) {
+ for (unsigned Reg : Regs) {
+ // For now only track virtual registers.
+ std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
+ assert (Pos != LiveRegs.end() && // Reg must be live.
+ LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
+ LiveRegsConsumers[Reg] >= 1);
+ --LiveRegsConsumers[Reg];
+ if (LiveRegsConsumers[Reg] == 0)
+ LiveRegs.erase(Pos);
+ }
+}
+
+void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
+ for (SIScheduleBlock* Block : Parent->getSuccs()) {
+ --BlockNumPredsLeft[Block->getID()];
+ if (BlockNumPredsLeft[Block->getID()] == 0) {
+ ReadyBlocks.push_back(Block);
+ }
+ // TODO: Improve check. When the dependencies between the high latency
+ // instructions and the instructions of the other blocks are WAR or WAW,
+ // no wait will be triggered. We would like these cases to not
+ // update LastPosHighLatencyParentScheduled.
+ if (Parent->isHighLatencyBlock())
+ LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+ }
+}
+
+void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
+ decreaseLiveRegs(Block, Block->getInRegs());
+ addLiveRegs(Block->getOutRegs());
+ releaseBlockSuccs(Block);
+ for (std::map<unsigned, unsigned>::iterator RegI =
+ LiveOutRegsNumUsages[Block->getID()].begin(),
+ E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
+ std::pair<unsigned, unsigned> RegP = *RegI;
+ if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
+ LiveRegsConsumers[RegP.first] = RegP.second;
+ else {
+ assert(LiveRegsConsumers[RegP.first] == 0);
+ LiveRegsConsumers[RegP.first] += RegP.second;
+ }
+ }
+ if (LastPosHighLatencyParentScheduled[Block->getID()] >
+ (unsigned)LastPosWaitedHighLatency)
+ LastPosWaitedHighLatency =
+ LastPosHighLatencyParentScheduled[Block->getID()];
+ ++NumBlockScheduled;
+}
+
+std::vector<int>
+SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
+ std::set<unsigned> &OutRegs) {
+ std::vector<int> DiffSetPressure;
+ DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
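+  // For each pressure set, estimate the delta from scheduling this block:
+  // an input register whose last remaining consumer is this block frees
+  // its weight, and every output register adds its weight.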
+
+ for (unsigned Reg : InRegs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (LiveRegsConsumers[Reg] > 1)
+ continue;
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ DiffSetPressure[*PSetI] -= PSetI.getWeight();
+ }
+ }
+
+ for (unsigned Reg : OutRegs) {
+ // For now only track virtual registers.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ DiffSetPressure[*PSetI] += PSetI.getWeight();
+ }
+ }
+
+ return DiffSetPressure;
+}
+
+// SIScheduler //
+
+struct SIScheduleBlockResult
+SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+ SISchedulerBlockSchedulerVariant ScheduleVariant) {
+ SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
+ SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
+ std::vector<SIScheduleBlock*> ScheduledBlocks;
+ struct SIScheduleBlockResult Res;
+
+ ScheduledBlocks = Scheduler.getBlocks();
+
+ for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
+ SIScheduleBlock *Block = ScheduledBlocks[b];
+ std::vector<SUnit*> SUs = Block->getScheduledUnits();
+
+ for (SUnit* SU : SUs)
+ Res.SUs.push_back(SU->NodeNum);
+ }
+
+ Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
+ Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
+ return Res;
+}
+
+// SIScheduleDAGMI //
+
+SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
+ ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) {
+ SITII = static_cast<const SIInstrInfo*>(TII);
+ SITRI = static_cast<const SIRegisterInfo*>(TRI);
+
+ VGPRSetID = SITRI->getVGPR32PressureSet();
+ SGPRSetID = SITRI->getSGPR32PressureSet();
+}
+
+SIScheduleDAGMI::~SIScheduleDAGMI() {
+}
+
+ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) {
+ return new SIScheduleDAGMI(C);
+}
+
+// Code adapted from scheduleDAG.cpp.
+// Does a topological sort over the SUs,
+// both top-down and bottom-up.
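+// This is Kahn's algorithm run in reverse: the worklist is seeded with the
+// ExitSU and the SUs without successors, and indices are handed out from
+// DAGSize-1 downwards, so every SU ends up with a smaller index than all of
+// its successors.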
+void SIScheduleDAGMI::topologicalSort() {
+ std::vector<int> TopDownSU2Index;
+ unsigned DAGSize = SUnits.size();
+ std::vector<SUnit*> WorkList;
+
+ DEBUG(dbgs() << "Topological Sort\n");
+ WorkList.reserve(DAGSize);
+
+ TopDownIndex2SU.resize(DAGSize);
+ TopDownSU2Index.resize(DAGSize);
+ BottomUpIndex2SU.resize(DAGSize);
+
+ WorkList.push_back(&getExitSU());
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ int NodeNum = SU->NodeNum;
+ unsigned Degree = SU->Succs.size();
+ TopDownSU2Index[NodeNum] = Degree;
+ if (Degree == 0) {
+ assert(SU->Succs.empty() && "SUnit should have no successors");
+ WorkList.push_back(SU);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ if (SU->NodeNum < DAGSize) {
+ TopDownSU2Index[SU->NodeNum] = --Id;
+ TopDownIndex2SU[Id] = SU->NodeNum;
+ }
+    for (SDep& Pred : SU->Preds) {
+      SUnit *PredSU = Pred.getSUnit();
+      if (PredSU->NodeNum < DAGSize && !--TopDownSU2Index[PredSU->NodeNum])
+        WorkList.push_back(PredSU);
+    }
+ }
+
+ BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(),
+ TopDownIndex2SU.rend());
+
+#ifndef NDEBUG
+ // Check correctness of the ordering
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ for (SDep& Pred : SU->Preds) {
+ if (Pred.getSUnit()->NodeNum >= DAGSize)
+ continue;
+ assert(TopDownSU2Index[SU->NodeNum] >
+ TopDownSU2Index[Pred.getSUnit()->NodeNum] &&
+ "Wrong Top Down topological sorting");
+ }
+ }
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ for (SDep& Succ : SU->Succs) {
+ if (Succ.getSUnit()->NodeNum >= DAGSize)
+ continue;
+ assert(TopDownSU2Index[SU->NodeNum] <
+ TopDownSU2Index[Succ.getSUnit()->NodeNum] &&
+ "Wrong Bottom Up topological sorting");
+ }
+ }
+#endif
+}
+
+// Move low-latency instructions further from their users without
+// increasing SGPR usage (in general).
+// This is to be replaced by a better pass that would take into
+// account SGPR usage (based on VGPR usage and the corresponding
+// wavefront count), and that would try to merge groups of loads
+// when it makes sense, etc.
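+// For example (hypothetical positions): if a low-latency load sits at
+// position 10 right before its user and all of its operands are ready from
+// position 3 on, the loop below moves it up to position 3, letting the
+// instructions in between cover part of the memory latency.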
+void SIScheduleDAGMI::moveLowLatencies() {
+ unsigned DAGSize = SUnits.size();
+ int LastLowLatencyUser = -1;
+ int LastLowLatencyPos = -1;
+
+ for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[ScheduledSUnits[i]];
+ bool IsLowLatencyUser = false;
+ unsigned MinPos = 0;
+
+ for (SDep& PredDep : SU->Preds) {
+ SUnit *Pred = PredDep.getSUnit();
+ if (SITII->isLowLatencyInstruction(Pred->getInstr())) {
+ IsLowLatencyUser = true;
+ }
+ if (Pred->NodeNum >= DAGSize)
+ continue;
+ unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
+ if (PredPos >= MinPos)
+ MinPos = PredPos + 1;
+ }
+
+ if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ unsigned BestPos = LastLowLatencyUser + 1;
+ if ((int)BestPos <= LastLowLatencyPos)
+ BestPos = LastLowLatencyPos + 1;
+ if (BestPos < MinPos)
+ BestPos = MinPos;
+ if (BestPos < i) {
+ for (unsigned u = i; u > BestPos; --u) {
+ ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+ ScheduledSUnits[u] = ScheduledSUnits[u-1];
+ }
+ ScheduledSUnits[BestPos] = SU->NodeNum;
+ ScheduledSUnitsInv[SU->NodeNum] = BestPos;
+ }
+ LastLowLatencyPos = BestPos;
+ if (IsLowLatencyUser)
+ LastLowLatencyUser = BestPos;
+ } else if (IsLowLatencyUser) {
+ LastLowLatencyUser = i;
+      // Also move the COPY instructions that the low-latency
+      // instructions depend on.
+ } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
+ bool CopyForLowLat = false;
+ for (SDep& SuccDep : SU->Succs) {
+ SUnit *Succ = SuccDep.getSUnit();
+ if (SITII->isLowLatencyInstruction(Succ->getInstr())) {
+ CopyForLowLat = true;
+ }
+ }
+ if (!CopyForLowLat)
+ continue;
+ if (MinPos < i) {
+ for (unsigned u = i; u > MinPos; --u) {
+ ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
+ ScheduledSUnits[u] = ScheduledSUnits[u-1];
+ }
+ ScheduledSUnits[MinPos] = SU->NodeNum;
+ ScheduledSUnitsInv[SU->NodeNum] = MinPos;
+ }
+ }
+ }
+}
+
+void SIScheduleDAGMI::restoreSULinksLeft() {
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ SUnits[i].isScheduled = false;
+ SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
+ SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
+ SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
+ SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
+ }
+}
+
+// Return the VGPR and SGPR usage corresponding to some virtual registers.
+template<typename _Iterator> void
+SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
+ unsigned &VgprUsage, unsigned &SgprUsage) {
+ VgprUsage = 0;
+ SgprUsage = 0;
+ for (_Iterator RegI = First; RegI != End; ++RegI) {
+ unsigned Reg = *RegI;
+ // For now only track virtual registers
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ for (; PSetI.isValid(); ++PSetI) {
+ if (*PSetI == VGPRSetID)
+ VgprUsage += PSetI.getWeight();
+ else if (*PSetI == SGPRSetID)
+ SgprUsage += PSetI.getWeight();
+ }
+ }
+}
+
+void SIScheduleDAGMI::schedule() {
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ SIScheduleBlockResult Best, Temp;
+ DEBUG(dbgs() << "Preparing Scheduling\n");
+
+ buildDAGWithRegPressure();
+ DEBUG(
+    for (SUnit& SU : SUnits)
+ SU.dumpAll(this)
+ );
+
+ Topo.InitDAGTopologicalSorting();
+ topologicalSort();
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+ // We reuse several ScheduleDAGMI and ScheduleDAGMILive
+ // functions, but to make them happy we must initialize
+ // the default Scheduler implementation (even if we do not
+ // run it)
+ SchedImpl->initialize(this);
+ initQueues(TopRoots, BotRoots);
+
+ // Fill some stats to help scheduling.
+
+ SUnitsLinksBackup = SUnits;
+ IsLowLatencySU.clear();
+ LowLatencyOffset.clear();
+ IsHighLatencySU.clear();
+
+ IsLowLatencySU.resize(SUnits.size(), 0);
+ LowLatencyOffset.resize(SUnits.size(), 0);
+ IsHighLatencySU.resize(SUnits.size(), 0);
+
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ unsigned BaseLatReg, OffLatReg;
+ if (SITII->isLowLatencyInstruction(SU->getInstr())) {
+ IsLowLatencySU[i] = 1;
+ if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg,
+ OffLatReg, TRI))
+ LowLatencyOffset[i] = OffLatReg;
+ } else if (SITII->isHighLatencyInstruction(SU->getInstr()))
+ IsHighLatencySU[i] = 1;
+ }
+
+ SIScheduler Scheduler(this);
+ Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
+ SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
+#if 0 // To be enabled when the handleMove fix lands.
+  // If VGPR usage is extremely high, try other well-performing variants
+  // which could lead to lower VGPR usage.
+ if (Best.MaxVGPRUsage > 180) {
+ std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ { LatenciesAlone, BlockRegUsageLatency },
+// { LatenciesAlone, BlockRegUsage },
+ { LatenciesGrouped, BlockLatencyRegUsage },
+// { LatenciesGrouped, BlockRegUsageLatency },
+// { LatenciesGrouped, BlockRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+// { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+  // If VGPR usage is still extremely high, we may spill. Try other variants
+  // which perform worse, but could lead to lower VGPR usage.
+ if (Best.MaxVGPRUsage > 200) {
+ std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+// { LatenciesAlone, BlockRegUsageLatency },
+ { LatenciesAlone, BlockRegUsage },
+// { LatenciesGrouped, BlockLatencyRegUsage },
+ { LatenciesGrouped, BlockRegUsageLatency },
+ { LatenciesGrouped, BlockRegUsage },
+// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
+ { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
+ { LatenciesAlonePlusConsecutive, BlockRegUsage }
+ };
+ for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
+ Temp = Scheduler.scheduleVariant(v.first, v.second);
+ if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
+ Best = Temp;
+ }
+ }
+#endif
+ ScheduledSUnits = Best.SUs;
+ ScheduledSUnitsInv.resize(SUnits.size());
+
+ for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
+ ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
+ }
+
+ moveLowLatencies();
+
+ // Tell the outside world about the result of the scheduling.
+
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+
+ for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
+ E = ScheduledSUnits.end(); I != E; ++I) {
+ SUnit *SU = &SUnits[*I];
+
+ scheduleMI(SU, true);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ }
+
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
--- /dev/null
+//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+namespace llvm {
+
+enum SIScheduleCandReason {
+ NoCand,
+ RegUsage,
+ Latency,
+ Successor,
+ Depth,
+ NodeOrder
+};
+
+struct SISchedulerCandidate {
+ // The reason for this candidate.
+ SIScheduleCandReason Reason;
+
+ // Set of reasons that apply to multiple candidates.
+ uint32_t RepeatReasonSet;
+
+ SISchedulerCandidate()
+ : Reason(NoCand), RepeatReasonSet(0) {}
+
+ bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
+ void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
+class SIScheduleBlock {
+ SIScheduleDAGMI *DAG;
+ SIScheduleBlockCreator *BC;
+
+ std::vector<SUnit*> SUnits;
+ std::map<unsigned, unsigned> NodeNum2Index;
+ std::vector<SUnit*> TopReadySUs;
+ std::vector<SUnit*> ScheduledSUnits;
+
+ /// The top of the unscheduled zone.
+ IntervalPressure TopPressure;
+ RegPressureTracker TopRPTracker;
+
+  // Pressure: the number of registers of a given class needed to
+  // hold the live virtual and physical registers.
+  // We only care about SGPR32 and VGPR32, and only track virtual registers.
+  // Pressure of the additional registers required inside the block.
+  std::vector<unsigned> InternalAdditionnalPressure;
+ // Pressure of input and output registers
+ std::vector<unsigned> LiveInPressure;
+ std::vector<unsigned> LiveOutPressure;
+ // Registers required by the block, and outputs.
+ // We do track only virtual registers.
+ // Note that some registers are not 32 bits,
+ // and thus the pressure is not equal
+ // to the number of live registers.
+ std::set<unsigned> LiveInRegs;
+ std::set<unsigned> LiveOutRegs;
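+  // For example, a live-in VReg_64 virtual register is a single entry in
+  // LiveInRegs, but accounts for a pressure of 2 in LiveInPressure.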
+
+ bool Scheduled;
+ bool HighLatencyBlock;
+
+ std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+ // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+ unsigned ID;
+
+ std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
+ std::vector<SIScheduleBlock*> Succs; // All blocks successors.
+ unsigned NumHighLatencySuccessors;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs(), NumHighLatencySuccessors(0) {}
+
+  ~SIScheduleBlock() {}
+
+ unsigned getID() const { return ID; }
+
+ /// Functions for Block construction.
+ void addUnit(SUnit *SU);
+
+ // When all SUs have been added.
+ void finalizeUnits();
+
+  // Add a block predecessor, i.e. a block containing an instruction
+  // predecessor of one of this block's SUs.
+ void addPred(SIScheduleBlock *Pred);
+ void addSucc(SIScheduleBlock *Succ);
+
+ const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
+ const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+
+  unsigned Height; // Maximum top-down path length to a block without outputs.
+  unsigned Depth;  // Maximum bottom-up path length to a block without inputs.
+
+ unsigned getNumHighLatencySuccessors() const {
+ return NumHighLatencySuccessors;
+ }
+
+ bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximate.
+  // Ideally it should take into account that some instructions
+  // (rcp, etc.) are 4 times slower.
+ int getCost() { return SUnits.size(); }
+
+  // The block predecessors and successors must all be registered
+  // before fastSchedule().
+  // Fast schedule with no particular requirements.
+ void fastSchedule();
+
+ std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize register pressure and
+  // hide latencies, and will fill the live-ins and live-outs.
+  // Needs all MIs to be grouped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling; the range is only used
+  // to correctly track the live registers.
+ void schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock);
+
+ bool isScheduled() { return Scheduled; }
+
+
+  // Only valid once the block has been scheduled internally.
+  // TODO: find a way to compute it.
+ std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+ return InternalAdditionnalPressure;
+ }
+
+ std::set<unsigned> &getInRegs() { return LiveInRegs; }
+ std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+ void printDebug(bool Full);
+
+private:
+ struct SISchedCandidate : SISchedulerCandidate {
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ unsigned SGPRUsage;
+ unsigned VGPRUsage;
+ bool IsLowLatency;
+ unsigned LowLatencyOffset;
+ bool HasLowLatencyNonWaitedParent;
+
+ SISchedCandidate()
+ : SU(nullptr) {}
+
+ bool isValid() const { return SU; }
+
+ // Copy the status of another candidate without changing policy.
+ void setBest(SISchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ SU = Best.SU;
+ Reason = Best.Reason;
+ SGPRUsage = Best.SGPRUsage;
+ VGPRUsage = Best.VGPRUsage;
+ IsLowLatency = Best.IsLowLatency;
+ LowLatencyOffset = Best.LowLatencyOffset;
+ HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+ }
+ };
+
+ void undoSchedule();
+
+ void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
+ void releaseSucc(SUnit *SU, SDep *SuccEdge);
+ // InOrOutBlock: restrict to links pointing inside the block (true),
+ // or restrict to links pointing outside the block (false).
+ void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+
+ void nodeScheduled(SUnit *SU);
+ void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+ void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+ SUnit* pickNode();
+ void traceCandidate(const SISchedCandidate &Cand);
+ void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock);
+};
+
+struct SIScheduleBlocks {
+ std::vector<SIScheduleBlock*> Blocks;
+ std::vector<int> TopDownIndex2Block;
+ std::vector<int> TopDownBlock2Index;
+};
+
+enum SISchedulerBlockCreatorVariant {
+ LatenciesAlone,
+ LatenciesGrouped,
+ LatenciesAlonePlusConsecutive
+};
+
+class SIScheduleBlockCreator {
+ SIScheduleDAGMI *DAG;
+ // unique_ptr handles freeing memory for us.
+ std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+ std::map<SISchedulerBlockCreatorVariant,
+ SIScheduleBlocks> Blocks;
+ std::vector<SIScheduleBlock*> CurrentBlocks;
+ std::vector<int> Node2CurrentBlock;
+
+  // Topological sort
+  // Maps topological index to the block number.
+ std::vector<int> TopDownIndex2Block;
+ std::vector<int> TopDownBlock2Index;
+ std::vector<int> BottomUpIndex2Block;
+
+ // 0 -> Color not given.
+ // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
+ // Above -> Other groups.
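+  // For example (sketch): colorHighLatenciesAlone() gives the first
+  // high-latency SU the Reserved color 1, the next one color 2, and so on,
+  // while colorAccordingToReservedDependencies() hands out colors above
+  // SUnits.size() to everything else.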
+ int NextReservedID;
+ int NextNonReservedID;
+ std::vector<int> CurrentColoring;
+ std::vector<int> CurrentTopDownReservedDependencyColoring;
+ std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+ SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+ ~SIScheduleBlockCreator();
+
+ SIScheduleBlocks
+ getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+ bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+  // Give a Reserved color to every high-latency instruction.
+ void colorHighLatenciesAlone();
+
+ // Create groups of high latencies with a Reserved color.
+ void colorHighLatenciesGroups();
+
+  // Compute the coloring for the top-down and bottom-up traversals, with
+  // different colors depending on the dependencies on Reserved colors.
+ void colorComputeReservedDependencies();
+
+  // Give a color to all non-colored SUs according to their
+  // Reserved group dependencies.
+ void colorAccordingToReservedDependencies();
+
+  // Divide blocks having no bottom-up or top-down dependencies on
+  // Reserved groups. The new colors are computed according to the
+  // dependencies on the other blocks formed with
+  // colorAccordingToReservedDependencies.
+ void colorEndsAccordingToDependencies();
+
+ // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
+ void colorForceConsecutiveOrderInGroup();
+
+  // Merge constant loads whose users all belong to one other group
+  // into that group.
+  // (TODO: else, if all their users depend on the same group, put them there.)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge SUs whose users all belong to one other group into that group.
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge SUs whose users all belong to one other group into that group,
+  // but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge SUs whose users all belong to one other group into that group,
+  // but only if the group is no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divide blocks that are too large.
+  // Idea of implementation: attribute new colors depending on the
+  // top-down and bottom-up links to other blocks.
+ void cutHugeBlocks();
+
+  // Put all instructions with no users in this scheduling region into
+  // one group (we'd want these groups to be at the end).
+ void regroupNoUserInstructions();
+
+ void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+ void topologicalSort();
+
+ void scheduleInsideBlocks();
+
+ void fillStats();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+ BlockLatencyRegUsage,
+ BlockRegUsageLatency,
+ BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+ SIScheduleDAGMI *DAG;
+ SISchedulerBlockSchedulerVariant Variant;
+ std::vector<SIScheduleBlock*> Blocks;
+
+ std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+ std::set<unsigned> LiveRegs;
+ // Num of schedulable unscheduled blocks reading the register.
+ std::map<unsigned, unsigned> LiveRegsConsumers;
+
+ std::vector<unsigned> LastPosHighLatencyParentScheduled;
+ int LastPosWaitedHighLatency;
+
+ std::vector<SIScheduleBlock*> BlocksScheduled;
+ unsigned NumBlockScheduled;
+ std::vector<SIScheduleBlock*> ReadyBlocks;
+
+ unsigned VregCurrentUsage;
+ unsigned SregCurrentUsage;
+
+  // Currently only an approximation.
+ unsigned maxVregUsage;
+ unsigned maxSregUsage;
+
+ std::vector<unsigned> BlockNumPredsLeft;
+ std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+ SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+ SISchedulerBlockSchedulerVariant Variant,
+ SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() {}
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
+
+  unsigned getVGPRUsage() { return maxVregUsage; }
+  unsigned getSGPRUsage() { return maxSregUsage; }
+
+private:
+ struct SIBlockSchedCandidate : SISchedulerCandidate {
+ // The best Block candidate.
+ SIScheduleBlock *Block;
+
+ bool IsHighLatency;
+ int VGPRUsageDiff;
+ unsigned NumSuccessors;
+ unsigned NumHighLatencySuccessors;
+ unsigned LastPosHighLatParentScheduled;
+ unsigned Height;
+
+ SIBlockSchedCandidate()
+ : Block(nullptr) {}
+
+ bool isValid() const { return Block; }
+
+ // Copy the status of another candidate without changing policy.
+ void setBest(SIBlockSchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ Block = Best.Block;
+ Reason = Best.Reason;
+ IsHighLatency = Best.IsHighLatency;
+ VGPRUsageDiff = Best.VGPRUsageDiff;
+ NumSuccessors = Best.NumSuccessors;
+ NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+ LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+ Height = Best.Height;
+ }
+ };
+
+ bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand);
+ bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+ SIBlockSchedCandidate &TryCand);
+ SIScheduleBlock *pickBlock();
+
+ void addLiveRegs(std::set<unsigned> &Regs);
+ void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+ void releaseBlockSuccs(SIScheduleBlock *Parent);
+ void blockScheduled(SIScheduleBlock *Block);
+
+  // Check the register pressure change
+  // from scheduling a block with these live-ins and live-outs.
+ std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+ std::set<unsigned> &OutRegs);
+
+ void schedule();
+};
+
+struct SIScheduleBlockResult {
+ std::vector<unsigned> SUs;
+ unsigned MaxSGPRUsage;
+ unsigned MaxVGPRUsage;
+};
+
+class SIScheduler {
+ SIScheduleDAGMI *DAG;
+ SIScheduleBlockCreator BlockCreator;
+
+public:
+  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
+
+  ~SIScheduler() {}
+
+ struct SIScheduleBlockResult
+ scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+ SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+class SIScheduleDAGMI : public ScheduleDAGMILive {
+ const SIInstrInfo *SITII;
+ const SIRegisterInfo *SITRI;
+
+ std::vector<SUnit> SUnitsLinksBackup;
+
+  // For moveLowLatencies(), after all scheduling variants have been tested.
+ std::vector<unsigned> ScheduledSUnits;
+ std::vector<unsigned> ScheduledSUnitsInv;
+
+ unsigned VGPRSetID;
+ unsigned SGPRSetID;
+
+public:
+ SIScheduleDAGMI(MachineSchedContext *C);
+
+ ~SIScheduleDAGMI() override;
+
+ // Entry point for the schedule.
+ void schedule() override;
+
+ // To init Block's RPTracker.
+ void initRPTracker(RegPressureTracker &RPTracker) {
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ }
+
+ MachineBasicBlock *getBB() { return BB; }
+  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
+  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
+  LiveIntervals *getLIS() { return LIS; }
+  MachineRegisterInfo *getMRI() { return &MRI; }
+  const TargetRegisterInfo *getTRI() { return TRI; }
+  SUnit& getEntrySU() { return EntrySU; }
+  SUnit& getExitSU() { return ExitSU; }
+
+ void restoreSULinksLeft();
+
+ template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
+ _Iterator End,
+ unsigned &VgprUsage,
+ unsigned &SgprUsage);
+  std::set<unsigned> getInRegs() {
+    std::set<unsigned> InRegs(RPTracker.getPressure().LiveInRegs.begin(),
+                              RPTracker.getPressure().LiveInRegs.end());
+    return InRegs;
+  }
+
+ unsigned getVGPRSetID() const { return VGPRSetID; }
+ unsigned getSGPRSetID() const { return SGPRSetID; }
+
+private:
+ void topologicalSort();
+ // After scheduling is done, improve low latency placements.
+ void moveLowLatencies();
+
+public:
+ // Some stats for scheduling inside blocks.
+ std::vector<unsigned> IsLowLatencySU;
+ std::vector<unsigned> LowLatencyOffset;
+ std::vector<unsigned> IsHighLatencySU;
+ // Topological sort
+ // Maps topological index to the node number.
+ std::vector<int> TopDownIndex2SU;
+ std::vector<int> BottomUpIndex2SU;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
using namespace llvm;
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {}
+SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
+ unsigned NumRegPressureSets = getNumRegPressureSets();
+
+ SGPR32SetID = NumRegPressureSets;
+ VGPR32SetID = NumRegPressureSets;
+ for (unsigned i = 0; i < NumRegPressureSets; ++i) {
+ if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
+ SGPR32SetID = i;
+ else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
+ VGPR32SetID = i;
+ }
+ assert(SGPR32SetID < NumRegPressureSets &&
+ VGPR32SetID < NumRegPressureSets);
+}
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
MCRegAliasIterator R(Reg, this, true);
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
- unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ // Leave space for flat_scr, xnack_mask, vcc, and alignment
+ unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
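+    // E.g. assuming FIXED_SGPR_COUNT_FOR_INIT_BUG is 80, this yields
+    // BaseIdx = 80 - 8 - 4 = 68, i.e. the SGPR68..SGPR71 tuple.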
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
- // next sgpr128 down.
+ // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
+ // 100/101 for vcc. This is the next sgpr128 down.
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
}
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
- unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
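+    // E.g. under the same assumption (FIXED_SGPR_COUNT_FOR_INIT_BUG == 80),
+    // Idx = 80 - 6 - 1 = 73, i.e. SGPR73.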
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- // Next register before reservations for flat_scr and vcc.
- return AMDGPU::SGPR97;
+ // Next register before reservations for flat_scr, xnack_mask, vcc,
+ // and scratch resource.
+ return AMDGPU::SGPR91;
}
return AMDGPU::SGPR95;
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
- // for VCC/FLAT_SCR.
+ // for VCC/XNACK_MASK/FLAT_SCR.
+ //
+    // TODO: The SGPRs that alias to XNACK_MASK could be used as general purpose
+ // SGPRs when the XNACK feature is not used. This is currently not done
+ // because the code that counts SGPRs cannot account for such holes.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
}
// to a hw bug.
if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
- // Assume XNACK_MASK is unused.
- unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4;
+ // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
+ unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
for (unsigned i = Limit; i < NumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
return Fn.getFrameInfo()->hasStackObjects();
}
+bool
+SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects();
+}
+
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
switch (Op) {
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
- const MachineFunction *MF = MI->getParent()->getParent();
+ MachineFunction *MF = MI->getParent()->getParent();
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
- LLVMContext &Ctx = MF->getFunction()->getContext();
DebugLoc DL = MI->getDebugLoc();
bool IsLoad = TII->get(LoadStoreOp).mayLoad();
bool RanOutOfSGPRs = false;
+ bool Scavenged = false;
unsigned SOffset = ScratchOffset;
+ unsigned OriginalImmOffset = Offset;
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
unsigned Size = NumSubRegs * 4;
if (!isUInt<12>(Offset + Size)) {
- SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
+ SOffset = AMDGPU::NoRegister;
+
+ // We don't have access to the register scavenger if this function is called
+ // during PEI::scavengeFrameVirtualRegs().
+ if (RS)
+ SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);
+
if (SOffset == AMDGPU::NoRegister) {
+      // There are no free SGPRs, and we are in the process of spilling
+      // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is
+      // true on SI/CI, and on VI it is true until we implement spilling
+      // using scalar stores), we have no way to free up an SGPR. Our
+      // solution here is to add the offset directly to the ScratchOffset
+      // register, and then subtract the offset after the spill to return
+      // ScratchOffset to its original value.
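+      // The emitted sequence is then (sketch):
+      //   s_add_u32  s_off, s_off, <imm>  ; fold the offset into s_off
+      //   buffer_load/store ...           ; the actual spill or restore
+      //   s_sub_u32  s_off, s_off, <imm>  ; restore s_off afterwards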
RanOutOfSGPRs = true;
- SOffset = AMDGPU::SGPR0;
+ SOffset = ScratchOffset;
+ } else {
+ Scavenged = true;
}
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
.addReg(ScratchOffset)
Offset = 0;
}
- if (RanOutOfSGPRs)
- Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
-
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
+ unsigned SOffsetRegState = 0;
+ if (i + 1 == e && Scavenged)
+ SOffsetRegState |= RegState::Kill;
+
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.addReg(SubReg, getDefRegState(IsLoad))
.addReg(ScratchRsrcReg)
- .addReg(SOffset)
+ .addReg(SOffset, SOffsetRegState)
.addImm(Offset)
.addImm(0) // glc
.addImm(0) // slc
.addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
+
+ if (RanOutOfSGPRs) {
+ // Subtract the offset we added to the ScratchOffset register.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
+ .addReg(ScratchOffset)
+ .addImm(OriginalImmOffset);
+ }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- if (Spill.VGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill.VGPR)
+ .addReg(SubReg)
+ .addImm(Spill.Lane);
+
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
+ } else {
+ // Spill SGPR to a frame index.
+        // FIXME: We should use S_STORE_DWORD here for VI.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addReg(SubReg);
+
+ unsigned Size = FrameInfo->getObjectSize(Index);
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+ MachineMemOperand *MMO
+ = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ Size, Align);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
+ .addReg(TmpReg) // src
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
}
-
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
- Spill.VGPR)
- .addReg(SubReg)
- .addImm(Spill.Lane);
-
- // FIXME: Since this spills to another register instead of an actual
- // frame index, we should delete the frame index when all references to
- // it are fixed.
}
MI->eraseFromParent();
break;
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
struct SIMachineFunctionInfo::SpilledReg Spill =
MFI->getSpilledReg(MF, Index, i);
- if (Spill.VGPR == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF->getFunction()->getContext();
- Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+ if (Spill.hasReg()) {
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ SubReg)
+ .addReg(Spill.VGPR)
+ .addImm(Spill.Lane)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ } else {
+ // Restore SGPR from a stack slot.
+ // FIXME: We should use S_LOAD_DWORD here for VI.
+
+ unsigned Align = FrameInfo->getObjectAlignment(Index);
+ unsigned Size = FrameInfo->getObjectSize(Index);
+
+ MachinePointerInfo PtrInfo
+ = MachinePointerInfo::getFixedStack(*MF, Index);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ .addFrameIndex(Index) // frame_idx
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addImm(i * 4) // offset
+ .addMemOperand(MMO);
+ BuildMI(*MBB, MI, DL,
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
+ .addReg(TmpReg, RegState::Kill)
+ .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
-
- BuildMI(*MBB, MI, DL,
- TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- SubReg)
- .addReg(Spill.VGPR)
- .addImm(Spill.Lane)
- .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
}
// TODO: only do this when it is needed
TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
MI->eraseFromParent();
break;
case AMDGPU::SI_SPILL_V32_RESTORE:
TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
- FrameInfo->getObjectOffset(Index), RS);
+ FrameInfo->getObjectOffset(Index) +
+ TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
MI->eraseFromParent();
break;
}
int64_t Offset = FrameInfo->getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
}
}
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const {
+ switch (VRC->getSize()) {
+ case 4:
+ return &AMDGPU::SGPR_32RegClass;
+ case 8:
+ return &AMDGPU::SReg_64RegClass;
+ case 16:
+ return &AMDGPU::SReg_128RegClass;
+ case 32:
+ return &AMDGPU::SReg_256RegClass;
+ case 64:
+ return &AMDGPU::SReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
+}
+
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
const TargetRegisterClass *RC, unsigned SubIdx) const {
if (SubIdx == AMDGPU::NoSubRegister)
return RC;
- // If this register has a sub-register, we can safely assume it is a 32-bit
- // register, because all of SI's sub-registers are 32-bit.
+ // We can assume that each lane corresponds to one 32-bit register.
+ unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
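+  // E.g. a sub0_sub1 sub-register index covers two 32-bit lanes, so
+  // Count == 2 and an SGPR class maps to SReg_64 below.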
if (isSGPRClass(RC)) {
- return &AMDGPU::SGPR_32RegClass;
+ switch (Count) {
+ case 1:
+ return &AMDGPU::SGPR_32RegClass;
+ case 2:
+ return &AMDGPU::SReg_64RegClass;
+ case 4:
+ return &AMDGPU::SReg_128RegClass;
+ case 8:
+ return &AMDGPU::SReg_256RegClass;
+ case 16: /* fall-through */
+ default:
+ llvm_unreachable("Invalid sub-register class size");
+ }
} else {
- return &AMDGPU::VGPR_32RegClass;
+ switch (Count) {
+ case 1:
+ return &AMDGPU::VGPR_32RegClass;
+ case 2:
+ return &AMDGPU::VReg_64RegClass;
+ case 3:
+ return &AMDGPU::VReg_96RegClass;
+ case 4:
+ return &AMDGPU::VReg_128RegClass;
+ case 8:
+ return &AMDGPU::VReg_256RegClass;
+ case 16: /* fall-through */
+ default:
+ llvm_unreachable("Invalid sub-register class size");
+ }
}
}
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_ID:
+ llvm_unreachable("unimplemented");
+ case SIRegisterInfo::FLAT_SCRATCH_INIT:
+ assert(MFI->hasFlatScratchInit());
+ return MFI->FlatScratchInitUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
return MFI->DispatchPtrUserSGPR;
struct SIRegisterInfo : public AMDGPURegisterInfo {
private:
+ unsigned SGPR32SetID;
+ unsigned VGPR32SetID;
+
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
public:
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
const TargetRegisterClass *getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const;
+ /// \returns A SGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentSGPRClass(
+ const TargetRegisterClass *VRC) const;
+
/// \returns The register class that is used for a sub-register of \p RC for
/// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
/// be returned.
enum PreloadedValue {
// SGPRS:
- PRIVATE_SEGMENT_BUFFER = 0,
+ PRIVATE_SEGMENT_BUFFER = 0,
DISPATCH_PTR = 1,
QUEUE_PTR = 2,
KERNARG_SEGMENT_PTR = 3,
+ DISPATCH_ID = 4,
+ FLAT_SCRATCH_INIT = 5,
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const;
+  unsigned getSGPR32PressureSet() const { return SGPR32SetID; }
+  unsigned getVGPR32PressureSet() const { return VGPR32SetID; }
+
private:
void buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp, unsigned Value,
unsigned ScratchRsrcReg, unsigned ScratchOffset,
- int64_t Offset, RegScavenger *RS) const;
+ int64_t Offset,
+ RegScavenger *RS) const;
};
} // End namespace llvm
SmallVector <Type*, 8> Types;
bool NeedToReplace = false;
Function *F = I.getCalledFunction();
+ if (!F)
+ return;
+
std::string Name = F->getName();
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
Value *Arg = I.getArgOperand(i);
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
+ if (Features.test(FeatureISAVersion8_0_3))
+ return {8, 0, 3};
+
return {0, 0, 0};
}
return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
}
-static const char ShaderTypeAttribute[] = "ShaderType";
-
-unsigned getShaderType(const Function &F) {
- Attribute A = F.getFnAttribute(ShaderTypeAttribute);
- unsigned ShaderType = ShaderType::COMPUTE;
+static unsigned getIntegerAttribute(const Function &F, const char *Name,
+ unsigned Default) {
+ Attribute A = F.getFnAttribute(Name);
+ unsigned Result = Default;
if (A.isStringAttribute()) {
StringRef Str = A.getValueAsString();
- if (Str.getAsInteger(0, ShaderType)) {
+ if (Str.getAsInteger(0, Result)) {
LLVMContext &Ctx = F.getContext();
Ctx.emitError("can't parse shader type");
}
}
- return ShaderType;
+ return Result;
+}
+
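+// For example, a function carrying the string attribute "ShaderType"="1"
+// makes getShaderType() below return 1; getAsInteger() with radix 0
+// auto-detects the base, so "0x1" would work as well.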
+unsigned getShaderType(const Function &F) {
+ return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
+}
+
+unsigned getInitialPSInputAddr(const Function &F) {
+ return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}
bool isSI(const MCSubtargetInfo &STI) {
bool isReadOnlySegment(const GlobalValue *GV);
unsigned getShaderType(const Function &F);
+unsigned getInitialPSInputAddr(const Function &F);
+
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
-//===----------------------------------------------------------------------===//
-// SMEM Patterns
-//===----------------------------------------------------------------------===//
-
let Predicates = [isVI] in {
// 1. Offset as 20bit DWORD immediate
(S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
>;
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr, vt:$data)),
- (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-
} // End Predicates = [isVI]