From 2ac1fa00c935017e365dcd21e13c8fab2f0a70c0 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Tue, 7 Jun 2016 21:37:17 +0000 Subject: [PATCH] AMDGPU: Add amdgpu-ps-wqm-outputs function attribute Summary: The presence of this attribute indicates that VGPR outputs should be computed in whole quad mode (WQM). This will be used by Mesa for prolog pixel shaders, so that derivatives can be taken of shader inputs computed by the prolog, fixing a bug (see the Bugzilla link below). The generated code could certainly be improved: if a prolog pixel shader is used (which isn't common in modern OpenGL; they're used for gl_Color, polygon stipples, and forcing per-sample interpolation), Mesa will use this attribute unconditionally, because it has to be conservative. So WQM may be used in the prolog when it isn't really needed, and furthermore a silly back-and-forth switch is likely to happen at the boundary between prolog and main shader parts. Fixing this is a bit involved: we'd first have to add a mechanism by which LLVM writes the WQM-related input requirements to the main shader part binary, and then have Mesa specialize the prolog part accordingly. At that point, we may as well just compile a monolithic shader... 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=95130 Reviewers: arsenm, tstellarAMD, mareko Subscribers: arsenm, llvm-commits, kzhuravl Differential Revision: http://reviews.llvm.org/D20839 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272063 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 29 +++++++++++++++++++++++++++-- test/CodeGen/AMDGPU/wqm.ll | 14 ++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index dd133d37eb7..79796853497 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -154,6 +154,7 @@ FunctionPass *llvm::createSIWholeQuadModePass() { char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist) { char GlobalFlags = 0; + bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; @@ -161,7 +162,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; unsigned Opcode = MI.getOpcode(); - char Flags; + char Flags = 0; if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { Flags = StateWQM; @@ -175,15 +176,39 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, ExecExports.push_back(&MI); } else if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); + } else if (WQMOutputs) { + // The function is in machine SSA form, which means that physical + // VGPRs correspond to shader inputs and outputs. Inputs are + // only used, outputs are only defined. 
+ for (const MachineOperand &MO : MI.defs()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVirtualRegister(Reg) && + TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + Flags = StateWQM; + break; + } + } } - continue; + if (!Flags) + continue; } Instructions[&MI].Needs = Flags; Worklist.push_back(&MI); GlobalFlags |= Flags; } + + if (WQMOutputs && MBB.succ_empty()) { + // This is a prolog shader. Make sure we go back to exact mode at the end. + Blocks[&MBB].OutNeeds = StateExact; + Worklist.push_back(&MBB); + GlobalFlags |= StateExact; + } } return GlobalFlags; diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 30915a82923..4eab0aec56b 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -332,6 +332,19 @@ main_body: ret <4 x float> %tex } +; Check prolog shaders. +; +; CHECK-LABEL: {{^}}test_prolog_1: +; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; CHECK: s_wqm_b64 exec, exec +; CHECK: v_add_f32_e32 v0, +; CHECK: s_and_b64 exec, exec, [[ORIG]] +define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 { +main_body: + %s = fadd float %a, %b + ret float %s +} + declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 @@ -345,3 +358,4 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float attributes #1 = { nounwind } attributes #2 = { nounwind readonly } attributes #3 = { nounwind readnone } +attributes #4 = { "amdgpu-ps-wqm-outputs" } -- 2.11.0