From fee41517fe0f7ff9f0e204dd9200ebf32ca03cb8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 17 Feb 2020 11:47:45 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Introduce post-legalize combiner The current set of custom combines is only really useful after legalization, so move them there. There is a lot of overlap in the boilerplate here, but I think we do want a pretty different set of combines before and after legalize. I think we will want a lot of overlap between the post-legalize and a post-regbankselect combiner. --- llvm/lib/Target/AMDGPU/AMDGPU.h | 2 + llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 9 +- .../Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 261 +++++++++++++++++++++ .../Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 116 +-------- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 5 +- .../AMDGPU/GlobalISel/combine-ashr-narrow.mir | 2 +- .../AMDGPU/GlobalISel/combine-lshr-narrow.mir | 2 +- .../AMDGPU/GlobalISel/combine-shl-narrow.mir | 2 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll | 21 ++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll | 21 ++ .../GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll | 4 +- llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll | 61 ++--- llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll | 28 +-- 14 files changed, 378 insertions(+), 163 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 1859cfaba5b..bae5e4059c3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -30,6 +30,8 @@ class Module; // GlobalISel passes void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &); FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone); +void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &); +FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); // R600 Passes FunctionPass *createR600VectorRegMerger(); 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 3f298fd1daf..4063bb1fca2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -26,7 +26,12 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, - elide_br_by_inverting_cond, - gfx6gfx7_combines]> { + elide_br_by_inverting_cond]> { let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; } + +def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< + "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, + gfx6gfx7_combines]> { + let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp new file mode 100644 index 00000000000..8f43e6e9f0e --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -0,0 +1,261 @@ +//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass does combining of machine instructions at the generic MI level, +// after the legalizer. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/Combiner.h" +#include "llvm/CodeGen/GlobalISel/CombinerHelper.h" +#include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" + +#define DEBUG_TYPE "amdgpu-postlegalizer-combiner" + +using namespace llvm; +using namespace MIPatternMatch; + +struct FMinFMaxLegacyInfo { + Register LHS; + Register RHS; + Register True; + Register False; + CmpInst::Predicate Pred; +}; + +// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize +static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineFunction &MF, FMinFMaxLegacyInfo &Info) { + // FIXME: Combines should have subtarget predicates, and we shouldn't need + // this here. 
+ if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy()) + return false; + + // FIXME: Type predicate on pattern + if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) + return false; + + Register Cond = MI.getOperand(1).getReg(); + if (!MRI.hasOneNonDBGUse(Cond) || + !mi_match(Cond, MRI, + m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS)))) + return false; + + Info.True = MI.getOperand(2).getReg(); + Info.False = MI.getOperand(3).getReg(); + + if (!(Info.LHS == Info.True && Info.RHS == Info.False) && + !(Info.LHS == Info.False && Info.RHS == Info.True)) + return false; + + switch (Info.Pred) { + case CmpInst::FCMP_FALSE: + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_ORD: + case CmpInst::FCMP_UNO: + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_UNE: + case CmpInst::FCMP_TRUE: + return false; + default: + return true; + } +} + +static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, + const FMinFMaxLegacyInfo &Info) { + + auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) { + MachineIRBuilder MIB(MI); + MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); + }; + + switch (Info.Pred) { + case CmpInst::FCMP_ULT: + case CmpInst::FCMP_ULE: + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_OLT: { + // We need to permute the operands to get the correct NaN behavior. The + // selected operand is the second one based on the failing compare with NaN, + // so permute it based on the compare type the hardware uses. 
+ if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); + break; + } + case CmpInst::FCMP_UGE: + case CmpInst::FCMP_UGT: { + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); + break; + } + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_OGE: { + if (Info.LHS == Info.True) + buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); + else + buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); + break; + } + default: + llvm_unreachable("predicate should not have matched"); + } + + MI.eraseFromParent(); +} + + +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + +namespace { +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + +class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo { + GISelKnownBits *KB; + MachineDominatorTree *MDT; + +public: + AMDGPUGenPostLegalizerCombinerHelper Generated; + + AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, + const AMDGPULegalizerInfo *LI, + GISelKnownBits *KB, MachineDominatorTree *MDT) + : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, + /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), + KB(KB), MDT(MDT) { + if (!Generated.parseCommandLineOption()) + report_fatal_error("Invalid rule identifier"); + } + + virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, + MachineIRBuilder &B) const override; +}; + +bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, + MachineInstr &MI, + MachineIRBuilder &B) const { + CombinerHelper Helper(Observer, B, KB, 
MDT); + + if (Generated.tryCombineAll(Observer, MI, B, Helper)) + return true; + + switch (MI.getOpcode()) { + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: + // On some subtargets, 64-bit shift is a quarter rate instruction. In the + // common case, splitting this into a move and a 32-bit shift is faster and + // the same code size. + return Helper.tryCombineShiftToUnmerge(MI, 32); + } + + return false; +} + +#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + +// Pass boilerplate +// ================ + +class AMDGPUPostLegalizerCombiner : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPostLegalizerCombiner(bool IsOptNone = false); + + StringRef getPassName() const override { + return "AMDGPUPostLegalizerCombiner"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +private: + bool IsOptNone; +}; +} // end anonymous namespace + +void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + getSelectionDAGFallbackAnalysisUsage(AU); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); + if (!IsOptNone) { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + } + MachineFunctionPass::getAnalysisUsage(AU); +} + +AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone) + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); +} + +bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + auto *TPC = &getAnalysis<TargetPassConfig>(); + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + + const 
GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const AMDGPULegalizerInfo *LI + = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); + + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + MachineDominatorTree *MDT = + IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); + AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize(), LI, KB, MDT); + Combiner C(PCInfo, TPC); + return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); +} + +char AMDGPUPostLegalizerCombiner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after legalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) +INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, + "Combine AMDGPU machine instrs after legalization", false, + false) + +namespace llvm { +FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) { + return new AMDGPUPostLegalizerCombiner(IsOptNone); +} +} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 2757dde6f25..1c337afadd4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -28,112 +28,13 @@ using namespace llvm; using namespace MIPatternMatch; -struct FMinFMaxLegacyInfo { - Register LHS; - Register RHS; - Register True; - Register False; - CmpInst::Predicate Pred; -}; - -// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize -static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineFunction &MF, FMinFMaxLegacyInfo &Info) { - // FIXME: Combines should have subtarget predicates, and we shouldn't need - // this here. 
- if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy()) - return false; - - // FIXME: Type predicate on pattern - if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) - return false; - - Register Cond = MI.getOperand(1).getReg(); - if (!MRI.hasOneNonDBGUse(Cond) || - !mi_match(Cond, MRI, - m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS)))) - return false; - - Info.True = MI.getOperand(2).getReg(); - Info.False = MI.getOperand(3).getReg(); - - if (!(Info.LHS == Info.True && Info.RHS == Info.False) && - !(Info.LHS == Info.False && Info.RHS == Info.True)) - return false; - - switch (Info.Pred) { - case CmpInst::FCMP_FALSE: - case CmpInst::FCMP_OEQ: - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_ORD: - case CmpInst::FCMP_UNO: - case CmpInst::FCMP_UEQ: - case CmpInst::FCMP_UNE: - case CmpInst::FCMP_TRUE: - return false; - default: - return true; - } -} - -static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, - const FMinFMaxLegacyInfo &Info) { - - auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) { - MachineIRBuilder MIB(MI); - MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); - }; - - switch (Info.Pred) { - case CmpInst::FCMP_ULT: - case CmpInst::FCMP_ULE: - if (Info.LHS == Info.True) - buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); - else - buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); - break; - case CmpInst::FCMP_OLE: - case CmpInst::FCMP_OLT: { - // We need to permute the operands to get the correct NaN behavior. The - // selected operand is the second one based on the failing compare with NaN, - // so permute it based on the compare type the hardware uses. 
- if (Info.LHS == Info.True) - buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); - else - buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); - break; - } - case CmpInst::FCMP_UGE: - case CmpInst::FCMP_UGT: { - if (Info.LHS == Info.True) - buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); - else - buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); - break; - } - case CmpInst::FCMP_OGT: - case CmpInst::FCMP_OGE: { - if (Info.LHS == Info.True) - buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); - else - buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); - break; - } - default: - llvm_unreachable("predicate should not have matched"); - } - - MI.eraseFromParent(); -} - - #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AMDGPUGenGICombiner.inc" +#include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS namespace { #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AMDGPUGenGICombiner.inc" +#include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo { @@ -165,13 +66,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, return true; switch (MI.getOpcode()) { - case TargetOpcode::G_SHL: - case TargetOpcode::G_LSHR: - case TargetOpcode::G_ASHR: - // On some subtargets, 64-bit shift is a quarter rate instruction. In the - // common case, splitting this into a move and a 32-bit shift is faster and - // the same code size. 
- return Helper.tryCombineShiftToUnmerge(MI, 32); case TargetOpcode::G_CONCAT_VECTORS: return Helper.tryCombineConcatVectors(MI); case TargetOpcode::G_SHUFFLE_VECTOR: @@ -182,7 +76,7 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AMDGPUGenGICombiner.inc" +#include "AMDGPUGenPreLegalizeGICombiner.inc" #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP // Pass boilerplate @@ -194,7 +88,9 @@ public: AMDGPUPreLegalizerCombiner(bool IsOptNone = false); - StringRef getPassName() const override { return "AMDGPUPreLegalizerCombiner"; } + StringRef getPassName() const override { + return "AMDGPUPreLegalizerCombiner"; + } bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 624d099018a..555b215d8e5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -218,6 +218,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); + initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); @@ -623,6 +624,7 @@ public: bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; + void addPreRegBankSelect() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; void addFastRegAlloc() override; @@ -911,6 +913,11 @@ bool GCNPassConfig::addLegalizeMachineIR() { return false; } +void GCNPassConfig::addPreRegBankSelect() { + bool IsOptNone = getOptLevel() == CodeGenOpt::None; + addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); +} + bool 
GCNPassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index bce539dfb6b..972d90db026 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -15,8 +15,10 @@ tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td) tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel) -tablegen(LLVM AMDGPUGenGICombiner.inc -gen-global-isel-combiner +tablegen(LLVM AMDGPUGenPreLegalizeGICombiner.inc -gen-global-isel-combiner -combiners="AMDGPUPreLegalizerCombinerHelper") +tablegen(LLVM AMDGPUGenPostLegalizeGICombiner.inc -gen-global-isel-combiner + -combiners="AMDGPUPostLegalizerCombinerHelper") set(LLVM_TARGET_DEFINITIONS R600.td) tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer) @@ -60,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp + AMDGPUPostLegalizerCombiner.cpp AMDGPUPreLegalizerCombiner.cpp AMDGPUPromoteAlloca.cpp AMDGPUPropagateAttributes.cpp diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir index bd044c77c38..f57623ff645 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --- name: narrow_ashr_s64_32_s64amt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir index 3c41f340916..c3f1093a3b1 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --- name: narrow_lshr_s64_32_s64amt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir index 313303e155b..41d0260c81f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll index 2ddc379c79f..6ac4dc886db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll @@ -253,3 +253,24 @@ define double @v_test_fmax_legacy_ult_f64(double %a, double %b) { %val = select i1 %cmp, double %b, double %a ret double %val } + +define <2 x float> @v_test_fmax_legacy_ogt_v2f32(<2 x float> %a, <2 x float> %b) { +; GFX6-LABEL: v_test_fmax_legacy_ogt_v2f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_max_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_max_legacy_f32_e32 v1, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_test_fmax_legacy_ogt_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ogt <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll index d11d05365fc..23b7c9d129a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll @@ -382,3 +382,24 @@ define float @v_test_fcmp_select_false(float %a, float %b) { %val = select i1 %cmp, float %a, float %b ret float %val } + +define <2 x float> @v_test_fmin_legacy_ole_v2f32(<2 x float> %a, <2 x float> %b) { +; GFX6-LABEL: v_test_fmin_legacy_ole_v2f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_min_legacy_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_min_legacy_f32_e32 v1, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_test_fmin_legacy_ole_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_le_f32_e32 vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ole <2 x float> %a, %b + %val = select <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + ret <2 x float> %val +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll index f4ede38b26a..28e4684ffac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -26,9 +26,7 @@ define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { ; 10 + 
9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 -; OS-UNKNOWN: s_add_u32 s[[LO:[0-9]+]], s0, 44 -; OS-UNKNOWN-NEXT: s_addc_u32 s[[HI:[0-9]+]], s1, 0 -; OS-UNKNOWN-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[LO]]:[[HI]]{{\]}}, 0xa +; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x15 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll index e23df92a303..b968982585f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll @@ -286,30 +286,31 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_udiv_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, s6 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2 +; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v2, v3 +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; 
CHECK-NEXT: v_mul_hi_u32 v2, v2, v1 -; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v1, v2 -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_mul_hi_u32 v1, v1, v0 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, 1, v1 -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v5 +; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CHECK-NEXT: v_subrev_i32_e32 v5, vcc, 1, v2 +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; CHECK-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v0, v4, v2, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i32 %num, 4096 ret i32 %result @@ -319,9 +320,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-LABEL: v_udiv_v2i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s8, 0x1000 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4 ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -329,9 +330,9 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8 -; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3 
+; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4 +; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 ; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 @@ -349,17 +350,17 @@ define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[4:5] ; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0 ; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v3 ; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 1, v3 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v8, 12, v4 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v4 ; CHECK-NEXT: v_subrev_i32_e32 v10, vcc, 1, v4 ; CHECK-NEXT: v_sub_i32_e32 v11, vcc, v0, v5 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v1, v8 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v8 -; CHECK-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v11 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[8:9], v0, v2 ; CHECK-NEXT: s_and_b64 s[6:7], s[6:7], vcc ; CHECK-NEXT: v_cndmask_b32_e64 v0, v6, v3, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 74ba97b6c95..45ce6cdf421 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -286,14 +286,14 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_urem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x1000 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s6 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s4 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 -; CHECK-NEXT: 
v_mul_hi_u32 v4, v2, s6 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, v2, s4 ; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v3 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc @@ -302,9 +302,9 @@ define i32 @v_urem_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CHECK-NEXT: v_mul_hi_u32 v2, v2, v0 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, s6 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 ; CHECK-NEXT: v_add_i32_e64 v4, s[4:5], v3, v1 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 ; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v3, v1 @@ -320,9 +320,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-LABEL: v_urem_v2i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s8, 0x1000 +; CHECK-NEXT: s_movk_i32 s4, 0x1000 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s8 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, s4 ; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 @@ -330,9 +330,9 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, s8 -; CHECK-NEXT: v_mul_hi_u32 v6, v3, s8 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, 12, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v3, s4 +; CHECK-NEXT: v_lshlrev_b32_e32 v7, 12, v4 ; CHECK-NEXT: v_mul_hi_u32 v8, v4, v2 ; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 ; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v7 @@ -350,11 +350,11 @@ define <2 x i32> @v_urem_v2i32_pow2k_denom(<2 x i32> %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v4, 
v4, v5, s[4:5] ; CHECK-NEXT: v_mul_hi_u32 v3, v3, v0 ; CHECK-NEXT: v_mul_hi_u32 v4, v4, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, s8 -; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v3 +; CHECK-NEXT: v_lshlrev_b32_e32 v4, 12, v4 ; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v4 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v5, v2 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 ; CHECK-NEXT: v_sub_i32_e64 v0, s[6:7], v5, v2 -- 2.11.0