/// \brief Return true if the target supports combining a
/// chain like:
/// \code
- /// %andResult = and %val1, #imm-with-one-bit-set;
+ /// %andResult = and %val1, #mask
/// %icmpResult = icmp %andResult, 0
- /// br i1 %icmpResult, label %dest1, label %dest2
/// \endcode
/// into a single machine instruction of a form like:
/// \code
- /// brOnBitSet %register, #bitNumber, dest
+ /// cc = test %register, #mask
/// \endcode
- bool isMaskAndBranchFoldingLegal() const {
- return MaskAndBranchFoldingIsLegal;
+ virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+ return false;
}
/// Return true if the target should transform:
/// the branch is usually predicted right.
bool PredictableSelectIsExpensive;
- /// MaskAndBranchFoldingIsLegal - Indicates if the target supports folding
- /// a mask of a single bit, a compare, and a branch into a single instruction.
- bool MaskAndBranchFoldingIsLegal;
-
/// \see enableExtLdPromotion.
bool EnableExtLdPromotion;
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
static cl::opt<bool> DisableBranchOpts(
bool optimizeExtractElementInst(Instruction *Inst);
bool dupRetToEnableTailCallOpts(BasicBlock *BB);
bool placeDbgValues(Function &F);
- bool sinkAndCmp(Function &F);
bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
Instruction *&Inst,
const SmallVectorImpl<Instruction *> &Exts,
// find a node corresponding to the value.
EverMadeChange |= placeDbgValues(F);
- // If there is a mask, compare against zero, and branch that can be combined
- // into a single target instruction, push the mask and compare into branch
- // users. Do this before OptimizeBlock -> OptimizeInst ->
- // OptimizeCmpExpression, which perturbs the pattern being searched for.
- if (!DisableBranchOpts) {
- EverMadeChange |= sinkAndCmp(F);
+ if (!DisableBranchOpts)
EverMadeChange |= splitBranchCondition(F);
- }
bool MadeChange = true;
while (MadeChange) {
return false;
}
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
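+///
+/// For example (illustrative), given
+/// \code
+///   %and = and i32 %a, 4        ; defined in one block
+///   ...
+///   %cmp = icmp eq i32 %and, 0  ; used in another block
+/// \endcode
+/// a copy of the 'and' is placed next to the compare so that isel sees the
+/// (and, icmp eq 0) pair together in a single block.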
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+ const TargetLowering &TLI,
+ SetOfInstrs &InsertedInsts) {
+ // Double-check that we're not trying to optimize an instruction that was
+ // already optimized by some other part of this pass.
+ assert(!InsertedInsts.count(AndI) &&
+ "Attempting to optimize already optimized and instruction");
+ (void) InsertedInsts;
+
+ // Nothing to do for single use in same basic block.
+ if (AndI->hasOneUse() &&
+ AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+ return false;
+
+ // Try to avoid cases where sinking/duplicating is likely to increase register
+ // pressure.
+ if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+ !isa<ConstantInt>(AndI->getOperand(1)) &&
+ AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+ return false;
+
+ for (auto *U : AndI->users()) {
+ Instruction *User = cast<Instruction>(U);
+
+    // Only sink when the 'and' mask feeds an icmp with 0.
+ if (!isa<ICmpInst>(User))
+ return false;
+
+ auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+ if (!CmpC || !CmpC->isZero())
+ return false;
+ }
+
+ if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+ return false;
+
+ DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+ DEBUG(AndI->getParent()->dump());
+
+ // Push the 'and' into the same block as the icmp 0. There should only be
+ // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+ // others, so we don't need to keep track of which BBs we insert into.
+ for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+ // Keep the 'and' in the same place if the use is already in the same block.
+ Instruction *InsertPt =
+ User->getParent() == AndI->getParent() ? AndI : User;
+ Instruction *InsertedAnd =
+ BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+ AndI->getOperand(1), "", InsertPt);
+ // Propagate the debug info.
+ InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+ // Replace a use of the 'and' with a use of the new 'and'.
+ TheUse = InsertedAnd;
+ ++NumAndUses;
+ DEBUG(User->getParent()->dump());
+ }
+
+ // We removed all uses, nuke the and.
+ AndI->eraseFromParent();
+ return true;
+}
+
/// Check if the candidates could be combined with a shift instruction, which
/// includes:
/// 1. Truncate instruction
!(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
return false;
- // Skip loads we've already transformed or have no reason to transform.
- if (Load->hasOneUse()) {
- User *LoadUser = *Load->user_begin();
- if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
- !dyn_cast<PHINode>(LoadUser))
- return false;
- }
+ // Skip loads we've already transformed.
+ if (Load->hasOneUse() &&
+ InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+ return false;
// Look at all uses of Load, looking through phis, to determine how many bits
// of the loaded value are needed.
IRBuilder<> Builder(Load->getNextNode());
auto *NewAnd = dyn_cast<Instruction>(
Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(NewAnd);
// Replace all uses of load with new and (except for the use of load in the
// new and itself).
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+ if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+ EnableAndCmpSinking && TLI)
+ return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
return MadeChange;
}
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
- if (!EnableAndCmpSinking)
- return false;
- if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
- return false;
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Does this BB end with the following?
- // %andVal = and %val, #single-bit-set
- // %icmpVal = icmp %andResult, 0
- // br i1 %cmpVal label %dest1, label %dest2"
- BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
- if (!Brcc || !Brcc->isConditional())
- continue;
- ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
- if (!Cmp || Cmp->getParent() != &BB)
- continue;
- ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
- if (!Zero || !Zero->isZero())
- continue;
- Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
- if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
- continue;
- ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
- if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
- continue;
- DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
- // Push the "and; icmp" for any users that are conditional branches.
- // Since there can only be one branch use per BB, we don't need to keep
- // track of which BBs we insert into.
- for (Use &TheUse : Cmp->uses()) {
- // Find brcc use.
- BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
- if (!BrccUser || !BrccUser->isConditional())
- continue;
- BasicBlock *UserBB = BrccUser->getParent();
- if (UserBB == &BB) continue;
- DEBUG(dbgs() << "found Brcc use\n");
-
- // Sink the "and; icmp" to use.
- MadeChange = true;
- BinaryOperator *NewAnd =
- BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
- BrccUser);
- CmpInst *NewCmp =
- CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
- "", BrccUser);
- TheUse = NewCmp;
- ++NumAndCmpsMoved;
- DEBUG(BrccUser->getParent()->dump());
- }
- }
- return MadeChange;
-}
-
/// \brief Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
- MaskAndBranchFoldingIsLegal = false;
EnableExtLdPromotion = false;
HasFloatingPointExceptions = true;
StackPointerRegisterToSaveRestore = 0;
setSchedulingPreference(Sched::Hybrid);
- // Enable TBZ/TBNZ
- MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
// Set required alignment.
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+  // Only sink the 'and' mask to the cmp use block if it is masking a single
+  // bit, since this is likely to fold the and/cmp/br into a single tbz
+  // instruction. It may be beneficial to sink in other cases, but we would
+  // have to check that the cmp would not get folded into the br to form a
+  // cbz for these to be beneficial.
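+  // For example (illustrative): 'and w8, w0, #0x4' + 'cbz w8, <bb>' can
+  // instead be selected as 'tbz w0, #2, <bb>' when the mask is a power of 2.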
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask)
+ return false;
+ return Mask->getUniqueInteger().isPowerOf2();
+}
+
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
return true;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue) const override {
// 'bics'
return true;
return Subtarget.hasFastLZCNT();
}
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
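+  // A mask-and-compare against zero maps onto a single TEST instruction
+  // (e.g. 'testb $4, ...'), so sinking the 'and' next to the compare is
+  // assumed to always be beneficial on X86.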
+ return true;
+}
+
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
return false;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue Y) const override;
/// Return the value type to use for ISD::SETCC.
--- /dev/null
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that and is sunk into cmp block to form tbz.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbnz {{w[0-9]+}}, #2
+
+; CHECK-CGP-LABEL: @and_sink1(
+; CHECK-CGP-NOT: and i32
+ %and = and i32 %a, 4
+ br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to form tbz.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:B]
+; CHECK: tbz w2, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:C]
+; CHECK: tbnz {{w[0-9]+}}, #2
+
+; CHECK-CGP-LABEL: @and_sink2(
+; CHECK-CGP-NOT: and i32
+ %and = and i32 %a, 4
+ store i32 0, i32* @A
+ br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @B
+ br i1 %c2, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+ store i32 0, i32* @C
+ br i1 %cmp, label %bb2, label %bb0
+bb2:
+ ret i32 1
+bb3:
+ ret i32 0
+}
+
+; Test that 'and' is not sunk since cbz is a better alternative.
+define i32 @and_sink3(i32 %a) {
+; CHECK-LABEL: and_sink3:
+; CHECK: and [[REG:w[0-9]+]], w0, #0x3
+; CHECK: [[LOOP:.L[A-Z0-9_]+]]:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: cbz [[REG]], [[LOOP]]
+
+; CHECK-CGP-LABEL: @and_sink3(
+; CHECK-CGP-NEXT: and i32
+ %and = and i32 %a, 3
+ br label %bb0
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb0, label %bb2
+bb2:
+ ret i32 0
+}
; Test that we don't fold the 'and' instruction into the compare.
define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
; CHECK-LABEL: icmp_eq_and_i32
-; CHECK: and [[REG:w[0-9]+]], w0, #0x4
+; CHECK: and [[REG:w[0-9]+]], w0, #0x3
; CHECK-NEXT: cbz [[REG]], {{LBB.+_3}}
+ %1 = and i32 %a, 3
+ br i1 %c, label %bb0, label %bb2
+bb0:
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+; Test that we do fold the 'and' instruction into the compare and
+; generate a tbz instruction for the conditional branch.
+define i32 @icmp_eq_and1bit_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and1bit_i32
+; CHECK: tbz {{w[0-9]+}}, #2, {{LBB.+_3}}
%1 = and i32 %a, 4
br i1 %c, label %bb0, label %bb2
bb0:
--- /dev/null
+; RUN: llc -mtriple=i686-unknown -verify-machineinstrs < %s | FileCheck %s
+; RUN: opt < %s -codegenprepare -S -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK-CGP %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that 'and' is sunk into bb0.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, A
+; CHECK: testb $4,
+; CHECK: jne
+
+; CHECK-CGP-LABEL: @and_sink1(
+; CHECK-CGP-NOT: and i32
+ %and = and i32 %a, 4
+ br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to bb1.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: movl $0, A
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, B
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movl $0, C
+; CHECK: testb $4,
+; CHECK: jne
+
+; CHECK-CGP-LABEL: @and_sink2(
+; CHECK-CGP-NOT: and i32
+ %and = and i32 %a, 4
+ store i32 0, i32* @A
+ br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @B
+ br i1 %c2, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+ store i32 0, i32* @C
+ br i1 %cmp, label %bb2, label %bb0
+bb2:
+ ret i32 1
+bb3:
+ ret i32 0
+}
+
+; Test that CodeGenPrepare doesn't get stuck in a loop sinking and hoisting a masked load.
+define i32 @and_sink3(i1 %c, i32* %p) {
+; CHECK-LABEL: and_sink3:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movzbl
+; CHECK: movl $0, A
+; CHECK: testl %
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink3(
+; CHECK-CGP: load i32
+; CHECK-CGP-NEXT: and i32
+ %load = load i32, i32* %p
+ %and = and i32 %load, 255
+ br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+ %cmp = icmp eq i32 %and, 0
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ ret i32 1
+bb2:
+ ret i32 0
+}
+
+; Test that CodeGenPrepare sinks/duplicates non-immediate 'and'.
+define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
+; CHECK-LABEL: and_sink4:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl
+; CHECK: movl $0, A
+; CHECK: testl [[REG1:%[a-z0-9]+]], [[REG2:%[a-z0-9]+]]
+; CHECK: jne
+; CHECK: movl {{%[a-z0-9]+}}, B
+; CHECK: testl [[REG1]], [[REG2]]
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink4(
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+ %and = and i32 %a, %b
+ %cmp = icmp eq i32 %and, 0
+ br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+ %add = add i32 %a, %b
+ store i32 %add, i32* @B
+ br i1 %cmp, label %bb2, label %bb3
+bb2:
+ ret i32 1
+bb3:
+ ret i32 0
+}
+
+; Test that CodeGenPrepare doesn't sink/duplicate non-immediate 'and'
+; when it would increase register pressure.
+define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
+; CHECK-LABEL: and_sink5:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: andl {{[0-9]+\(%[a-z0-9]+\)}}, [[REG:%[a-z0-9]+]]
+; CHECK: movl $0, A
+; CHECK: jne
+; CHECK: movl {{%[a-z0-9]+}}, B
+; CHECK: testl [[REG]], [[REG]]
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink5(
+; CHECK-CGP: and i32
+; CHECK-CGP-NOT: icmp
+ %and = and i32 %a, %b
+ %cmp = icmp eq i32 %and, 0
+ br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+ store i32 0, i32* @A
+ br i1 %cmp, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+ %add = add i32 %a2, %b2
+ store i32 %add, i32* @B
+ br i1 %cmp, label %bb2, label %bb3
+bb2:
+ ret i32 1
+bb3:
+ ret i32 0
+}