From 2392ae7d7344674dc3d946e324342515f4771b90 Mon Sep 17 00:00:00 2001
From: Chris Lattner
Date: Thu, 15 Apr 2010 04:48:01 +0000
Subject: [PATCH] Implement rdar://7860110 (also in target/readme.txt) narrowing
 a load/or/and/store sequence into a narrower store when it is safe.  Daniel
 tells me that clang will start producing this sort of thing with bitfields,
 and this does trigger a few dozen times on 176.gcc produced by llvm-gcc even
 now.

This compiles code like CodeGen/X86/2009-05-28-DAGCombineCrash.ll into:

        movl    %eax, 36(%rdi)

instead of:

        movl    $4294967295, %eax       ## imm = 0xFFFFFFFF
        andq    32(%rdi), %rax
        shlq    $32, %rcx
        addq    %rax, %rcx
        movq    %rcx, 32(%rdi)

and each of the testcases into a single store.  Each of them used to compile
into craziness like this:

_test4:
        movl    $65535, %eax            ## imm = 0xFFFF
        andl    (%rdi), %eax
        shll    $16, %esi
        addl    %eax, %esi
        movl    %esi, (%rdi)
        ret

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@101343 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 186 +++++++++++++++++++++++++++----
 lib/Target/README.txt                    |  13 ---
 test/CodeGen/X86/store-narrow.ll         |  81 ++++++++++++++
 3 files changed, 245 insertions(+), 35 deletions(-)
 create mode 100644 test/CodeGen/X86/store-narrow.ll

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f734917efe2..671c5077056 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -254,24 +254,28 @@ namespace {
     /// looking for a better chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
-    /// getShiftAmountTy - Returns a type large enough to hold any valid
-    /// shift amount - before type legalization these can be huge.
-    EVT getShiftAmountTy() {
-      return LegalTypes ? TLI.getShiftAmountTy() : TLI.getPointerTy();
-    }
-
-public:
+  public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
-      : DAG(D),
-        TLI(D.getTargetLoweringInfo()),
-        Level(Unrestricted),
-        OptLevel(OL),
-        LegalOperations(false),
-        LegalTypes(false),
-        AA(A) {}
+      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(Unrestricted),
+        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
 
     /// Run - runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
+
+    SelectionDAG &getDAG() const { return DAG; }
+
+    /// getShiftAmountTy - Returns a type large enough to hold any valid
+    /// shift amount - before type legalization these can be huge.
+    EVT getShiftAmountTy() {
+      return LegalTypes ? TLI.getShiftAmountTy() : TLI.getPointerTy();
+    }
+
+    /// isTypeLegal - This method returns true if we are running before type
+    /// legalization or if the specified VT is legal.
+    bool isTypeLegal(const EVT &VT) {
+      if (!LegalTypes) return true;
+      return TLI.isTypeLegal(VT);
+    }
   };
 }
 
@@ -3950,7 +3954,7 @@ SDValue DAGCombiner::visitBIT_CONVERT(SDNode *N) {
       VT.isInteger() && !VT.isVector()) {
     unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
     EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
-    if (TLI.isTypeLegal(IntXVT) || !LegalTypes) {
+    if (isTypeLegal(IntXVT)) {
       SDValue X = DAG.getNode(ISD::BIT_CONVERT, N0.getDebugLoc(),
                               IntXVT, N0.getOperand(1));
       AddToWorkList(X.getNode());
@@ -4465,7 +4469,7 @@ SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
 
   // fold (fp_round_inreg c1fp) -> c1fp
-  if (N0CFP && (TLI.isTypeLegal(EVT) || !LegalTypes)) {
+  if (N0CFP && isTypeLegal(EVT)) {
     SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT);
     return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, Round);
   }
@@ -5146,6 +5150,123 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
   return SDValue();
 }
 
+/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
+/// load is having specific bytes cleared out.  If so, return the byte size
+/// being masked out and the shift amount.
+static std::pair<unsigned, unsigned>
+CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
+  std::pair<unsigned, unsigned> Result(0, 0);
+
+  // Check for the structure we're looking for.
+  if (V->getOpcode() != ISD::AND ||
+      !isa<ConstantSDNode>(V->getOperand(1)) ||
+      !ISD::isNormalLoad(V->getOperand(0).getNode()))
+    return Result;
+
+  // Check the chain and pointer.  The store should be chained directly to the
+  // load (TODO: Or through a TF node!) since it's to the same address.
+  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
+  if (LD->getBasePtr() != Ptr ||
+      V->getOperand(0).getNode() != Chain.getNode())
+    return Result;
+
+  // This only handles simple types.
+  if (V.getValueType() != MVT::i16 &&
+      V.getValueType() != MVT::i32 &&
+      V.getValueType() != MVT::i64)
+    return Result;
+
+  // Check the constant mask.  Invert it so that the bits being masked out are
+  // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
+  // follow the sign bit for uniformity.
+  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
+  unsigned NotMaskLZ = CountLeadingZeros_64(NotMask);
+  if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
+  unsigned NotMaskTZ = CountTrailingZeros_64(NotMask);
+  if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
+  if (NotMaskLZ == 64) return Result;  // All zero mask.
+
+  // See if we have a continuous run of bits.  If so, we have 0*1+0*
+  if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64)
+    return Result;
+
+  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
+  if (V.getValueType() != MVT::i64 && NotMaskLZ)
+    NotMaskLZ -= 64-V.getValueSizeInBits();
+
+  unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
+  switch (MaskedBytes) {
+  case 1:
+  case 2:
+  case 4: break;
+  default: return Result;  // All one mask, or 5-byte mask.
+  }
+
+  // Verify that the first bit starts at a multiple of mask so that the access
+  // is aligned the same as the access width.
+  if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+
+  Result.first = MaskedBytes;
+  Result.second = NotMaskTZ/8;
+  return Result;
+}
+
+
+/// ShrinkLoadReplaceStoreWithStore - Check to see if IVal is something that
+/// provides a value as specified by MaskInfo.  If so, replace the specified
+/// store with a narrower store of truncated IVal.
+static SDNode *
+ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
+                                SDValue IVal, StoreSDNode *St,
+                                DAGCombiner *DC) {
+  unsigned NumBytes = MaskInfo.first;
+  unsigned ByteShift = MaskInfo.second;
+  SelectionDAG &DAG = DC->getDAG();
+
+  // Check to see if IVal is all zeros in the part being masked in by the 'or'
+  // that uses this.  If not, this is not a replacement.
+  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
+                                  ByteShift*8, (ByteShift+NumBytes)*8);
+  if (!DAG.MaskedValueIsZero(IVal, Mask)) return 0;
+
+  // Check that it is legal on the target to do this.  It is legal if the new
+  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
+  // legalization.
+  MVT VT = MVT::getIntegerVT(NumBytes*8);
+  if (!DC->isTypeLegal(VT))
+    return 0;
+
+  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
+  // shifted by ByteShift and truncated down to NumBytes.
+  if (ByteShift)
+    IVal = DAG.getNode(ISD::SRL, IVal->getDebugLoc(), IVal.getValueType(), IVal,
+                       DAG.getConstant(ByteShift*8, DC->getShiftAmountTy()));
+
+  // Figure out the offset for the store and the alignment of the access.
+  unsigned StOffset;
+  unsigned NewAlign = St->getAlignment();
+
+  if (DAG.getTargetLoweringInfo().isLittleEndian())
+    StOffset = ByteShift;
+  else
+    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
+
+  SDValue Ptr = St->getBasePtr();
+  if (StOffset) {
+    Ptr = DAG.getNode(ISD::ADD, IVal->getDebugLoc(), Ptr.getValueType(),
+                      Ptr, DAG.getConstant(StOffset, Ptr.getValueType()));
+    NewAlign = MinAlign(NewAlign, StOffset);
+  }
+
+  // Truncate down to the new size.
+  IVal = DAG.getNode(ISD::TRUNCATE, IVal->getDebugLoc(), VT, IVal);
+
+  ++OpsNarrowed;
+  return DAG.getStore(St->getChain(), St->getDebugLoc(), IVal, Ptr,
+                      St->getSrcValue(), St->getSrcValueOffset()+StOffset,
+                      false, false, NewAlign).getNode();
+}
+
 /// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
 /// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
@@ -5165,6 +5286,28 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     return SDValue();
 
   unsigned Opc = Value.getOpcode();
+
+  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+  // is a byte mask indicating a consecutive number of bytes, check to see if
+  // Y is known to provide just those bytes.  If so, we try to replace the
+  // load + replace + store sequence with a single (narrower) store, which makes
+  // the load dead.
+  if (Opc == ISD::OR) {
+    std::pair<unsigned, unsigned> MaskedLoad;
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(1), ST,this))
+        return SDValue(NewST, 0);
+
+    // Or is commutative, so try swapping X and Y.
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(0), ST,this))
+        return SDValue(NewST, 0);
+  }
+
   if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
       Value.getOperand(1).getOpcode() != ISD::Constant)
     return SDValue();
@@ -5212,8 +5355,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
       PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
 
     unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
-    if (NewAlign <
-        TLI.getTargetData()->getABITypeAlignment(NewVT.getTypeForEVT(*DAG.getContext())))
+    const Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
+    if (NewAlign < TLI.getTargetData()->getABITypeAlignment(NewVTTy))
       return SDValue();
 
     SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
@@ -5283,8 +5426,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     case MVT::ppcf128:
       break;
     case MVT::f32:
-      if (((TLI.isTypeLegal(MVT::i32) || !LegalTypes) && !LegalOperations &&
-           !ST->isVolatile()) ||
+      if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
           TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
         Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
                               bitcastToAPInt().getZExtValue(), MVT::i32);
@@ -5295,7 +5437,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       }
       break;
     case MVT::f64:
-      if (((TLI.isTypeLegal(MVT::i64) || !LegalTypes) && !LegalOperations &&
+      if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
           !ST->isVolatile()) ||
          TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
        Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
@@ -5660,7 +5802,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
     }
 
     // Add count and size info.
-    if (!TLI.isTypeLegal(VT) && LegalTypes)
+    if (!isTypeLegal(VT))
      return SDValue();
 
     // Return the new VECTOR_SHUFFLE node.
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 052a5756da9..150143bec7f 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -263,19 +263,6 @@ if anyone cared enough about sincos.
 //===---------------------------------------------------------------------===//
 
-Turn this into a single byte store with no load (the other 3 bytes are
-unmodified):
-
-define void @test(i32* %P) {
-  %tmp = load i32* %P
-  %tmp14 = or i32 %tmp, 3305111552
-  %tmp15 = and i32 %tmp14, 3321888767
-  store i32 %tmp15, i32* %P
-  ret void
-}
-
-//===---------------------------------------------------------------------===//
-
 quantum_sigma_x in 462.libquantum contains the following loop:
 
       for(i=0; i<reg->size; i++)
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
new file mode 100644
index 00000000000..23aa6164778
--- /dev/null
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -0,0 +1,81 @@
+; rdar://7860110
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.2"
+
+define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i32* %a0, align 4
+  %B = and i32 %A, -256     ; 0xFFFFFF00
+  %C = zext i8 %a1 to i32
+  %D = or i32 %C, %B
+  store i32 %D, i32* %a0, align 4
+  ret void
+
+; CHECK: test1:
+; CHECK: movb %sil, (%rdi)
+}
+
+define void @test2(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i32* %a0, align 4
+  %B = and i32 %A, -65281    ; 0xFFFF00FF
+  %C = zext i8 %a1 to i32
+  %CS = shl i32 %C, 8
+  %D = or i32 %B, %CS
+  store i32 %D, i32* %a0, align 4
+  ret void
+; CHECK: test2:
+; CHECK: movb %sil, 1(%rdi)
+}
+
+define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i32* %a0, align 4
+  %B = and i32 %A, -65536    ; 0xFFFF0000
+  %C = zext i16 %a1 to i32
+  %D = or i32 %B, %C
+  store i32 %D, i32* %a0, align 4
+  ret void
+; CHECK: test3:
+; CHECK: movw %si, (%rdi)
+}
+
+define void @test4(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i32* %a0, align 4
+  %B = and i32 %A, 65535    ; 0x0000FFFF
+  %C = zext i16 %a1 to i32
+  %CS = shl i32 %C, 16
+  %D = or i32 %B, %CS
+  store i32 %D, i32* %a0, align 4
+  ret void
+; CHECK: test4:
+; CHECK: movw %si, 2(%rdi)
+}
+
+define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i64* %a0, align 4
+  %B = and i64 %A, -4294901761    ; 0xFFFFFFFF0000FFFF
+  %C = zext i16 %a1 to i64
+  %CS = shl i64 %C, 16
+  %D = or i64 %B, %CS
+  store i64 %D, i64* %a0, align 4
+  ret void
+; CHECK: test5:
+; CHECK: movw %si, 2(%rdi)
+}
+
+define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
+entry:
+  %A = load i64* %a0, align 4
+  %B = and i64 %A, -280375465082881    ; 0xFFFF00FFFFFFFFFF
+  %C = zext i8 %a1 to i64
+  %CS = shl i64 %C, 40
+  %D = or i64 %B, %CS
+  store i64 %D, i64* %a0, align 4
+  ret void
+; CHECK: test6:
+; CHECK: movb %sil, 5(%rdi)
+}
-- 
2.11.0
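
For reference, here is a minimal C-level sketch of the bitfield pattern the
commit message refers to ("clang will start producing this sort of thing with
bitfields").  It is illustrative only: the struct and function names are
hypothetical and not part of the patch, and the assembly mentioned is what one
would expect on x86-64 rather than verified output.

struct Flags {
  unsigned lo : 16;   /* low 16 bits of the 32-bit container word */
  unsigned hi : 16;   /* high 16 bits */
};

void set_hi(struct Flags *f, unsigned short v) {
  /* A front end typically lowers this bitfield store as: load the whole i32
   * container, AND it with 0x0000FFFF, OR in (zext v) << 16, and store the
   * i32 back -- the same shape as @test4 above.  With this patch the DAG
   * combiner proves the OR'd value only covers the two masked-out bytes and
   * emits a single 16-bit store, e.g. "movw %si, 2(%rdi)", making the load
   * dead. */
  f->hi = v;
}

As CheckForMaskedLoad requires, the narrowing only fires when the cleared
bytes form a contiguous run of 1, 2, or 4 bytes that starts on a multiple of
its own width; oddly sized or misaligned bitfields still go through the full
load/or/and/store sequence.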