From abc019968067736a499467f7db7fb758a425ca06 Mon Sep 17 00:00:00 2001 From: Nate Begeman Date: Fri, 5 Jun 2009 21:37:30 +0000 Subject: [PATCH] Adapt the x86 build_vector dagcombine to the current state of the legalizer. build vectors with i64 elements will only appear on 32b x86 before legalize. Since vector widening occurs during legalize, and produces i64 build_vector elements, the dag combiner is never run on these before legalize splits them into 32b elements. Teach the build_vector dag combine in x86 back end to recognize consecutive loads producing the low part of the vector. Convert the two uses of TLI's consecutive load recognizer to pass LoadSDNodes since that was required implicitly. Add a testcase for the transform. Old: subl $28, %esp movl 32(%esp), %eax movl 4(%eax), %ecx movl %ecx, 4(%esp) movl (%eax), %eax movl %eax, (%esp) movaps (%esp), %xmm0 pmovzxwd %xmm0, %xmm0 movl 36(%esp), %eax movaps %xmm0, (%eax) addl $28, %esp ret New: movl 4(%esp), %eax pmovzxwd (%eax), %xmm0 movl 8(%esp), %eax movaps %xmm0, (%eax) ret git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@72957 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Target/TargetLowering.h | 10 +-- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 +++-- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 17 +++-- lib/Target/X86/X86ISelLowering.cpp | 111 +++++++++++++++++++--------- test/CodeGen/X86/dagcombine-buildvector.ll | 16 +++- 5 files changed, 114 insertions(+), 59 deletions(-) diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 163f4c5ae50..ef166a26c55 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -825,11 +825,11 @@ public: virtual bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const; - /// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is - /// loading 'Bytes' bytes from a location that is 'Dist' units away from the - /// location that the 'Base' load is loading from. - bool isConsecutiveLoad(SDNode *LD, SDNode *Base, unsigned Bytes, int Dist, - const MachineFrameInfo *MFI) const; + /// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a + /// location that is 'Dist' units away from the location that the 'Base' load + /// is loading from. + bool isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, + int Dist, const MachineFrameInfo *MFI) const; /// PerformDAGCombine - This method will be invoked for all target nodes and /// for any target-independent nodes that the target has registered with diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d1b2a3ea28..609ec82c5ad 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3626,30 +3626,29 @@ static SDNode *getBuildPairElt(SDNode *N, unsigned i) { SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, MVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); - SDNode *LD1 = getBuildPairElt(N, 0); - if (!ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) + LoadSDNode *LD1 = dyn_cast(getBuildPairElt(N, 0)); + LoadSDNode *LD2 = dyn_cast(getBuildPairElt(N, 1)); + if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse()) return SDValue(); MVT LD1VT = LD1->getValueType(0); - SDNode *LD2 = getBuildPairElt(N, 1); const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && // If both are volatile this would reduce the number of volatile loads. // If one is volatile it might be ok, but play conservative and bail out. - !cast(LD1)->isVolatile() && - !cast(LD2)->isVolatile() && + !LD1->isVolatile() && + !LD2->isVolatile() && TLI.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1, MFI)) { - LoadSDNode *LD = cast(LD1); - unsigned Align = LD->getAlignment(); + unsigned Align = LD1->getAlignment(); unsigned NewAlign = TLI.getTargetData()-> getABITypeAlignment(VT.getTypeForMVT()); if (NewAlign <= Align && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) - return DAG.getLoad(VT, N->getDebugLoc(), LD->getChain(), LD->getBasePtr(), - LD->getSrcValue(), LD->getSrcValueOffset(), - false, Align); + return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(), + LD1->getBasePtr(), LD1->getSrcValue(), + LD1->getSrcValueOffset(), false, Align); } return SDValue(); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 3334e53f0fb..ab4cd515531 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2070,13 +2070,13 @@ bool TargetLowering::isGAPlusOffset(SDNode *N, GlobalValue* &GA, } -/// isConsecutiveLoad - Return true if LD (which must be a LoadSDNode) is -/// loading 'Bytes' bytes from a location that is 'Dist' units away from the -/// location that the 'Base' load is loading from. -bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, - unsigned Bytes, int Dist, +/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a +/// location that is 'Dist' units away from the location that the 'Base' load +/// is loading from. +bool TargetLowering::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base, + unsigned Bytes, int Dist, const MachineFrameInfo *MFI) const { - if (LD->getOperand(0).getNode() != Base->getOperand(0).getNode()) + if (LD->getChain() != Base->getChain()) return false; MVT VT = LD->getValueType(0); if (VT.getSizeInBits() / 8 != Bytes) @@ -2094,6 +2094,11 @@ bool TargetLowering::isConsecutiveLoad(SDNode *LD, SDNode *Base, if (FS != BFS || FS != (int)Bytes) return false; return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } + if (Loc.getOpcode() == ISD::ADD && Loc.getOperand(0) == BaseLoc) { + ConstantSDNode *V = dyn_cast(Loc.getOperand(1)); + if (V && (V->getSExtValue() == Dist*Bytes)) + return true; + } GlobalValue *GV1 = NULL; GlobalValue *GV2 = NULL; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 924155c4505..77c9f3d02a6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7675,8 +7675,9 @@ static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, if (Elt.getOpcode() == ISD::UNDEF) continue; - if (!TLI.isConsecutiveLoad(Elt.getNode(), Base, - EVT.getSizeInBits()/8, i, MFI)) + LoadSDNode *LD = cast(Elt); + LoadSDNode *LDBase = cast(Base); + if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI)) return false; } return true; @@ -7751,44 +7752,82 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG, MVT VT = N->getValueType(0); MVT EVT = VT.getVectorElementType(); - if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit()) - // We are looking for load i64 and zero extend. We want to transform - // it before legalizer has a chance to expand it. Also look for i64 - // BUILD_PAIR bit casted to f64. - return SDValue(); - // This must be an insertion into a zero vector. - SDValue HighElt = N->getOperand(1); - if (!isZeroNode(HighElt)) - return SDValue(); + + // Before or during type legalization, we want to try and convert a + // build_vector of an i64 load and a zero value into vzext_movl before the + // legalizer can break it up. + // FIXME: does the case below remove the need to do this? + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) { + if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit()) + return SDValue(); + + // This must be an insertion into a zero vector. + SDValue HighElt = N->getOperand(1); + if (!isZeroNode(HighElt)) + return SDValue(); + + // Value must be a load. + SDNode *Base = N->getOperand(0).getNode(); + if (!isa(Base)) { + if (Base->getOpcode() != ISD::BIT_CONVERT) + return SDValue(); + Base = Base->getOperand(0).getNode(); + if (!isa(Base)) + return SDValue(); + } + + // Transform it into VZEXT_LOAD addr. + LoadSDNode *LD = cast(Base); + + // Load must not be an extload. + if (LD->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + + // Load type should legal type so we don't have to legalize it. + if (!TLI.isTypeLegal(VT)) + return SDValue(); + + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + TargetLowering::TargetLoweringOpt TLO(DAG); + TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); + DCI.CommitTargetLoweringOpt(TLO); + return ResNode; + } + + // The type legalizer will have broken apart v2i64 build_vector created during + // widening before the code which handles that case is run. Look for build + // vector (load, load + 4, 0/undef, 0/undef) + if (VT == MVT::v4i32 || VT == MVT::v4f32) { + LoadSDNode *LD0 = dyn_cast(N->getOperand(0)); + LoadSDNode *LD1 = dyn_cast(N->getOperand(1)); + if (!LD0 || !LD1) + return SDValue(); + if (LD0->getExtensionType() != ISD::NON_EXTLOAD || + LD1->getExtensionType() != ISD::NON_EXTLOAD) + return SDValue(); + // Make sure the second elt is a consecutive load. + if (!TLI.isConsecutiveLoad(LD1, LD0, EVT.getSizeInBits()/8, 1, + DAG.getMachineFunction().getFrameInfo())) + return SDValue(); - // Value must be a load. - SDNode *Base = N->getOperand(0).getNode(); - if (!isa(Base)) { - if (Base->getOpcode() != ISD::BIT_CONVERT) + SDValue N2 = N->getOperand(2); + SDValue N3 = N->getOperand(3); + if (!isZeroNode(N2) && N2.getOpcode() != ISD::UNDEF) return SDValue(); - Base = Base->getOperand(0).getNode(); - if (!isa(Base)) + if (!isZeroNode(N3) && N3.getOpcode() != ISD::UNDEF) return SDValue(); + + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LD0->getChain(), LD0->getBasePtr() }; + SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); + TargetLowering::TargetLoweringOpt TLO(DAG); + TLO.CombineTo(SDValue(LD0, 1), ResNode.getValue(1)); + DCI.CommitTargetLoweringOpt(TLO); + return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); } - - // Transform it into VZEXT_LOAD addr. - LoadSDNode *LD = cast(Base); - - // Load must not be an extload. - if (LD->getExtensionType() != ISD::NON_EXTLOAD) - return SDValue(); - - // Load type should legal type so we don't have to legalize it. - if (!TLI.isTypeLegal(VT)) - return SDValue(); - - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; - SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); - TargetLowering::TargetLoweringOpt TLO(DAG); - TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1)); - DCI.CommitTargetLoweringOpt(TLO); - return ResNode; + return SDValue(); } /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. diff --git a/test/CodeGen/X86/dagcombine-buildvector.ll b/test/CodeGen/X86/dagcombine-buildvector.ll index c89a296d0db..b96fdfc03c6 100644 --- a/test/CodeGen/X86/dagcombine-buildvector.ll +++ b/test/CodeGen/X86/dagcombine-buildvector.ll @@ -1,13 +1,25 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -o %t -f +; RUN: llvm-as < %s | llc -march=x86 -mcpu=penryn -disable-mmx -o %t -f ; RUN: grep unpcklpd %t | count 1 ; RUN: grep movapd %t | count 1 +; RUN: grep movaps %t | count 1 ; Shows a dag combine bug that will generate an illegal build vector ; with v2i64 build_vector i32, i32. -define void @test(<2 x double>* %dst, <4 x double> %src) { +define void @test(<2 x double>* %dst, <4 x double> %src) nounwind { entry: %tmp7.i = shufflevector <4 x double> %src, <4 x double> undef, <2 x i32> < i32 0, i32 2 > store <2 x double> %tmp7.i, <2 x double>* %dst ret void } + +define void @test2(<4 x i16>* %src, <4 x i32>* %dest) nounwind { +entry: + %tmp1 = load <4 x i16>* %src + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> + %0 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3) + store <4 x i32> %0, <4 x i32>* %dest + ret void +} + +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone -- 2.11.0