From ba60b2bba26cb059cf2618b30e9115691ecef5ec Mon Sep 17 00:00:00 2001
From: Nikolai Bozhenov
Date: Thu, 2 Mar 2017 22:12:15 +0000
Subject: [PATCH] [BypassSlowDivision] Use ValueTracking to simplify run-time checks

ValueTracking is used for a more thorough analysis of the operands. Based on
the analysis, either the run-time checks can be simplified (e.g. check only
one operand instead of two) or the transformation can be avoided altogether.
For example, it is quite often the case that a divisor is promoted from a
shorter type, which makes the run-time checks for it redundant.

With additional compile-time analysis of values, two special cases naturally
arise and are addressed by the patch:

1) Both operands are known to be short enough. Then the long division can
   simply be replaced with a short one, without any CFG modification.

2) The division is unsigned and the dividend is known to be short. Then the
   long division is not needed at all: if the divisor is too big for a short
   division, the quotient is obviously zero and the remainder equals the
   dividend. More precisely, the long division is unnecessary whenever
   (divisor > dividend).

Differential Revision: https://reviews.llvm.org/D29897

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296832 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Utils/BypassSlowDivision.cpp | 137 ++++++++++++++++-----
 .../NVPTX/bypass-slow-div-special-cases.ll  |  95 ++++++++++++++
 2 files changed, 203 insertions(+), 29 deletions(-)
 create mode 100644 test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll

diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index ed663de0199..9d8cb3187ee 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -83,17 +84,28 @@ namespace llvm {
 }
 
 namespace {
+enum ValueRange {
+  /// Operand definitely fits into BypassType. No runtime checks are needed.
+  VALRNG_SHORT,
+  /// A runtime check is required, as the value range is unknown.
+  VALRNG_UNKNOWN,
+  /// Operand is unlikely to fit into BypassType. Bypassing should be
+  /// disabled.
+  VALRNG_LONG
+};
+
 class FastDivInsertionTask {
   bool IsValidTask = false;
   Instruction *SlowDivOrRem = nullptr;
   IntegerType *BypassType = nullptr;
   BasicBlock *MainBB = nullptr;
 
+  ValueRange getValueRange(Value *Op);
   QuotRemWithBB createSlowBB(BasicBlock *Successor);
   QuotRemWithBB createFastBB(BasicBlock *Successor);
   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                    BasicBlock *PhiBB);
-  Value *insertOperandRuntimeCheck();
+  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
   Optional<QuotRemPair> insertFastDivAndRem();
 
   bool isSignedOp() {
@@ -175,6 +187,28 @@ Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
   return isDivisionOp() ? Value.Quotient : Value.Remainder;
 }
 
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+  unsigned HiBits = LongLen - ShortLen;
+
+  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+  APInt Zeros(LongLen, 0), Ones(LongLen, 0);
+
+  computeKnownBits(V, Zeros, Ones, DL);
+
+  if (Zeros.countLeadingOnes() >= HiBits)
+    return VALRNG_SHORT;
+
+  if (Ones.countLeadingZeros() < HiBits)
+    return VALRNG_LONG;
+
+  return VALRNG_UNKNOWN;
+}
+
 /// Add new basic block for slow div and rem operations and put it before
 /// SuccessorBB.
 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
@@ -241,22 +275,17 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
 
 /// Creates a runtime check to test whether both the divisor and dividend fit
 /// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck() {
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+  assert((Op1 || Op2) && "Nothing to check");
   IRBuilder<> Builder(MainBB, MainBB->end());
-  Value *Dividend = SlowDivOrRem->getOperand(0);
-  Value *Divisor = SlowDivOrRem->getOperand(1);
-
-  // We should have bailed out above if the divisor is a constant, but the
-  // dividend may still be a constant. Set OrV to our non-constant operands
-  // OR'ed together.
-  assert(!isa<ConstantInt>(Divisor));
 
   Value *OrV;
-  if (!isa<ConstantInt>(Dividend))
-    OrV = Builder.CreateOr(Dividend, Divisor);
+  if (Op1 && Op2)
+    OrV = Builder.CreateOr(Op1, Op2);
   else
-    OrV = Divisor;
+    OrV = Op1 ? Op1 : Op2;
 
   // BitMask is inverted to check if the operands are
   // larger than the bypass type
@@ -279,22 +308,72 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
     return None;
   }
 
-  // If the numerator is a constant, bail if it doesn't fit into BypassType.
-  if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
-    if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
-      return None;
-
-  // Split the basic block before the div/rem.
-  BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
-  // Remove the unconditional branch from MainBB to SuccessorBB.
-  MainBB->getInstList().back().eraseFromParent();
-  QuotRemWithBB Fast = createFastBB(SuccessorBB);
-  QuotRemWithBB Slow = createSlowBB(SuccessorBB);
-  QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
-  Value *CmpV = insertOperandRuntimeCheck();
-  IRBuilder<> Builder(MainBB, MainBB->end());
-  Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
-  return Result;
+  ValueRange DividendRange = getValueRange(Dividend);
+  if (DividendRange == VALRNG_LONG)
+    return None;
+
+  ValueRange DivisorRange = getValueRange(Divisor);
+  if (DivisorRange == VALRNG_LONG)
+    return None;
+
+  bool DividendShort = (DividendRange == VALRNG_SHORT);
+  bool DivisorShort = (DivisorRange == VALRNG_SHORT);
+
+  if (DividendShort && DivisorShort) {
+    // If both operands are known to be short then just replace the long
+    // division with a short one in-place.
+
+    IRBuilder<> Builder(SlowDivOrRem);
+    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+    return QuotRemPair(ExtDiv, ExtRem);
+  } else if (DividendShort && !isSignedOp()) {
+    // If the division is unsigned and Dividend is known to be short, then
+    // either
+    // 1) Divisor is less than or equal to Dividend, and the result can be
+    //    computed with a short division.
+    // 2) Divisor is greater than Dividend. In this case, no division is needed
+    //    at all: The quotient is 0 and the remainder is equal to Dividend.
+    //
+    // So instead of checking at runtime whether Divisor fits into BypassType,
+    // we emit a runtime check to differentiate between these two cases. This
+    // lets us entirely avoid a long div.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Long;
+    Long.BB = MainBB;
+    Long.Quotient = ConstantInt::get(getSlowType(), 0);
+    Long.Remainder = Dividend;
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+    return Result;
+  } else {
+    // General case. Create both slow and fast div/rem pairs and choose one of
+    // them at runtime.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+                                            DivisorShort ? nullptr : Divisor);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+    return Result;
+  }
 }
 
 /// This optimization identifies DIV/REM instructions in a BB that can be

diff --git a/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
new file mode 100644
index 00000000000..c820ad72f1f
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done in apparently unsuitable cases.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT:    [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative, so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; No OR instruction is needed if one of the operands (the divisor) is known
+; to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:         [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT:    br label [[TMP10]]
+; CHECK:         [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT:    store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT:    [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; In the case of an unsigned long division with a short dividend,
+; the long division is not needed at all.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP9]]
+; CHECK:         [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  store i64 %res, i64* %retptr
+  ret void
+}
-- 
2.11.0
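
For illustration only (not part of the patch): below is a minimal C++ model of
the control flow this transformation emits for an unsigned i64 udiv/urem pair
whose dividend is known to fit into 32 bits, as exercised by @Test_special_case
above. The function and parameter names are hypothetical; only the branching
structure mirrors the generated IR.

    #include <cstdint>

    // Dividend is assumed to fit into 32 bits. Division by zero remains
    // undefined behavior, exactly as in the original long division.
    uint64_t bypassedUDivURem(uint64_t Dividend, uint64_t Divisor,
                              uint64_t &Rem) {
      if (Dividend >= Divisor) {
        // Here Divisor <= Dividend < 2^32, so both operands can be
        // truncated and a short 32-bit division gives the exact result.
        uint32_t D = static_cast<uint32_t>(Dividend);
        uint32_t V = static_cast<uint32_t>(Divisor);
        Rem = D % V;
        return D / V;
      }
      // Divisor > Dividend: the quotient is 0 and the remainder is the
      // dividend, so no division is executed at all.
      Rem = Dividend;
      return 0;
    }

Neither path performs a 64-bit division, which is why this case needs no
runtime check on the divisor's width at all.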