From 1e3c4790abc355a619d0c778bfd5f599e5ec2df4 Mon Sep 17 00:00:00 2001
From: Changpeng Fang
Date: Mon, 18 Feb 2019 23:00:26 +0000
Subject: [PATCH] AMDGPU: Use MachineInstr::mayAlias to replace
 areMemAccessesTriviallyDisjoint in LoadStoreOptimizer pass.

Summary:
This fixes a memory-dependence bug in the LoadStoreOptimizer: when checking
whether two memory accesses can be reordered, use MachineInstr::mayAlias,
which queries alias analysis on the machine memory operands, instead of
SIInstrInfo::areMemAccessesTriviallyDisjoint.

Reviewers: arsenm, rampitec

Differential Revision: https://reviews.llvm.org/D58295

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354295 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstrInfo.cpp                 |  11 --
 lib/Target/AMDGPU/SILoadStoreOptimizer.cpp        |  18 ++-
 test/CodeGen/AMDGPU/ds-combine-with-dependence.ll | 130 ++++++++++++++++++++++
 3 files changed, 138 insertions(+), 21 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/ds-combine-with-dependence.ll

diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0e3048792aa..8abf91b8ae2 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2215,17 +2215,6 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
     return false;
 
-  if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
-    const MachineMemOperand *MMOa = *MIa.memoperands_begin();
-    const MachineMemOperand *MMOb = *MIb.memoperands_begin();
-    if (MMOa->getValue() && MMOb->getValue()) {
-      MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
-      MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
-      if (!AA->alias(LocA, LocB))
-        return true;
-    }
-  }
-
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 6fc587bf0d6..f6bb7b3196f 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -256,13 +256,11 @@ static void addDefsUsesToList(const MachineInstr &MI,
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                       MachineBasicBlock::iterator B,
-                                      const SIInstrInfo *TII,
                                       AliasAnalysis *AA) {
   // RAW or WAR - cannot reorder
   // WAW - cannot reorder
   // RAR - safe to reorder
-  return !(A->mayStore() || B->mayStore()) ||
-    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
 }
 
 // Add MI and its defs to the lists if MI reads one of the defs that are
@@ -294,13 +292,13 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
 
 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                     ArrayRef<MachineInstr *> InstsToMove,
-                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
+                                    AliasAnalysis *AA) {
   assert(MemOp.mayLoadOrStore());
 
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
-    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
       return false;
   }
   return true;
@@ -566,8 +564,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     }
 
     if (MBBI->mayLoadOrStore() &&
-        (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+        (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
       // We fail condition #1, but we may still be able to satisfy condition
      // #2. Add this instruction to the move list and then we will check
      // if condition #2 holds once we have selected the matching instruction.
@@ -646,7 +644,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
       // move and make sure they are all safe to move down past the merged
       // instruction.
       if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
-        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
           return true;
     }
 
@@ -655,8 +653,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
     // it was safe to move I and also all the instruction in InstsToMove
     // down past this instruction.
     // check if we can move I across MBBI and if we can move all I's users
-    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
       break;
   }
   return false;
diff --git a/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
new file mode 100644
index 00000000000..06fa048124a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -0,0 +1,130 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+
+; There is no dependence between the store and the two loads, so we can combine the loads;
+; the combined load is placed at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_nodep
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
+define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %v0 = extractelement <3 x float> %load0, i32 2
+
+  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+  %sum = fadd float %v0, %v1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; The store depends on the first load, so we cannot move the first load down to combine it with
+; the second load directly. However, we can move the store below the combined load.
+
+; GCN-LABEL: {{^}}ds_combine_WAR
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %v0 = extractelement <3 x float> %load0, i32 2
+
+  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+  %sum = fadd float %v0, %v1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; The second load depends on the store. We can combine the two loads, and the combined load is
+; placed at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_RAW
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
+define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %v0 = extractelement <3 x float> %load0, i32 2
+
+  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+  %sum = fadd float %v0, %v1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; The store depends on the first load, and the second load depends on the store,
+; so we cannot combine the two loads.
+
+; GCN-LABEL: {{^}}ds_combine_WAR_RAW
+
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
+define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+  %v0 = extractelement <3 x float> %load0, i32 2
+
+  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+  %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+  %sum = fadd float %v0, %v1
+  store float %sum, float addrspace(1)* %out, align 4
+  ret void
+}
-- 
2.11.0
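For reference, the post-patch reordering logic can be read in one place. The sketch below condenses the two helpers from the SILoadStoreOptimizer.cpp diff above and adds explanatory comments; it is an illustration of the check the pass now performs through MachineInstr::mayAlias, not a drop-in copy of the file, and the includes are only what this standalone snippet would plausibly need.

// Sketch of the reordering checks after this patch, condensed from the diff above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <cassert>

using namespace llvm;

// Two accesses can be reordered if neither one writes memory (RAR), or if
// MachineInstr::mayAlias reports that they do not alias. mayAlias consults
// alias analysis on the instructions' MachineMemOperands (the third argument
// enables TBAA); this call is the replacement for the previous path through
// SIInstrInfo::areMemAccessesTriviallyDisjoint shown in the diff.
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// The instructions collected in InstsToMove may be moved across MemOp only
// if every memory access among them can be reordered with MemOp.
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

In the new ds-combine-with-dependence.ll test, this check is what lets the nodep, WAR, and RAW kernels form ds_read2/ds_write2 pairs while leaving the two loads uncombined in the WAR_RAW kernel.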