[InstCombine] avoid creating an extra instruction in zext fold and possible inf-loop

[android-x86/external-llvm-project.git] / llvm / lib / Transforms / InstCombine / InstCombineCasts.cpp
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp

index c68f9e8..07e68c4 100644 (file)
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -14,10 +14,11 @@
  #include "llvm/ADT/SetVector.h"
  #include "llvm/Analysis/ConstantFolding.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
  #include "llvm/IR/PatternMatch.h"
  #include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
  #include <numeric>
  using namespace llvm;
  using namespace PatternMatch;
@@ -81,8 +82,8 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
  
  /// If we find a cast of an allocation instruction, try to eliminate the cast by
  /// moving the type information into the alloc.
-Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
-                                                   AllocaInst &AI) {
+Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
+                                                       AllocaInst &AI) {
    PointerType *PTy = cast<PointerType>(CI.getType());
  
    IRBuilderBase::InsertPointGuard Guard(Builder);
@@ -93,8 +94,20 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
    Type *CastElTy = PTy->getElementType();
    if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
  
-  unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy);
-  unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy);
+  // This optimisation does not work for cases where the cast type
+  // is scalable and the allocated type is not. This because we need to
+  // know how many times the casted type fits into the allocated type.
+  // For the opposite case where the allocated type is scalable and the
+  // cast type is not this leads to poor code quality due to the
+  // introduction of 'vscale' into the calculations. It seems better to
+  // bail out for this case too until we've done a proper cost-benefit
+  // analysis.
+  bool AllocIsScalable = isa<ScalableVectorType>(AllocElTy);
+  bool CastIsScalable = isa<ScalableVectorType>(CastElTy);
+  if (AllocIsScalable != CastIsScalable) return nullptr;
+
+  Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
+  Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
    if (CastElTyAlign < AllocElTyAlign) return nullptr;
  
    // If the allocation has multiple uses, only promote it if we are strictly
@@ -102,14 +115,15 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
    // same, we open the door to infinite loops of various kinds.
    if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
  
-  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
-  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
+  // The alloc and cast types should be either both fixed or both scalable.
+  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinSize();
+  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinSize();
    if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
  
    // If the allocation has multiple uses, only promote it if we're not
    // shrinking the amount of memory being allocated.
-  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
-  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
+  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getKnownMinSize();
+  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinSize();
    if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
  
    // See if we can satisfy the modulus by pulling a scale out of the array
@@ -124,6 +138,9 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
    if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
        (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return nullptr;
  
+  // We don't currently support arrays of scalable types.
+  assert(!AllocIsScalable || (ArrayOffset == 1 && ArraySizeScale == 0));
+
    unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
    Value *Amt = nullptr;
    if (Scale == 1) {
@@ -160,8 +177,8 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
  
  /// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
  /// true for, actually insert the code to evaluate the expression.
-Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
-                                             bool isSigned) {
+Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty,
+                                                 bool isSigned) {
    if (Constant *C = dyn_cast<Constant>(V)) {
      C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
      // If we got a constantexpr back, try to simplify it with DL info.
@@ -229,8 +246,9 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
    return InsertNewInstWith(Res, *I);
  }
  
-Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
-                                                        const CastInst *CI2) {
+Instruction::CastOps
+InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
+                                       const CastInst *CI2) {
    Type *SrcTy = CI1->getSrcTy();
    Type *MidTy = CI1->getDestTy();
    Type *DstTy = CI2->getDestTy();
@@ -257,7 +275,7 @@ Instruction::CastOps InstCombiner::isEliminableCastPair(const CastInst *CI1,
  }
  
  /// Implement the transforms common to all CastInst visitors.
-Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
    Value *Src = CI.getOperand(0);
  
    // Try to eliminate a cast of a cast.
@@ -342,7 +360,7 @@ static bool canNotEvaluateInType(Value *V, Type *Ty) {
  ///
  /// This function works on both vectors and scalars.
  ///
-static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
+static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC,
                                   Instruction *CxtI) {
    if (canAlwaysEvaluateInType(V, Ty))
      return true;
@@ -377,29 +395,31 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
      break;
    }
    case Instruction::Shl: {
-    // If we are truncating the result of this SHL, and if it's a shift of a
-    // constant amount, we can always perform a SHL in a smaller type.
-    const APInt *Amt;
-    if (match(I->getOperand(1), m_APInt(Amt))) {
-      uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (Amt->getLimitedValue(BitWidth) < BitWidth)
-        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
-    }
+    // If we are truncating the result of this SHL, and if it's a shift of an
+    // inrange amount, we can always perform a SHL in a smaller type.
+    uint32_t BitWidth = Ty->getScalarSizeInBits();
+    KnownBits AmtKnownBits =
+        llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+    if (AmtKnownBits.getMaxValue().ult(BitWidth))
+      return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+             canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
      break;
    }
    case Instruction::LShr: {
      // If this is a truncate of a logical shr, we can truncate it to a smaller
      // lshr iff we know that the bits we would otherwise be shifting in are
      // already zeros.
-    const APInt *Amt;
-    if (match(I->getOperand(1), m_APInt(Amt))) {
-      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
-      uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (Amt->getLimitedValue(BitWidth) < BitWidth &&
-          IC.MaskedValueIsZero(I->getOperand(0),
-            APInt::getBitsSetFrom(OrigBitWidth, BitWidth), 0, CxtI)) {
-        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
-      }
+    // TODO: It is enough to check that the bits we would be shifting in are
+    //       zero - use AmtKnownBits.getMaxValue().
+    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+    uint32_t BitWidth = Ty->getScalarSizeInBits();
+    KnownBits AmtKnownBits =
+        llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+    APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+    if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+        IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
+      return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+             canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
      }
      break;
    }
@@ -409,15 +429,15 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
      // original type and the sign bit of the truncate type are similar.
      // TODO: It is enough to check that the bits we would be shifting in are
      //       similar to sign bit of the truncate type.
-    const APInt *Amt;
-    if (match(I->getOperand(1), m_APInt(Amt))) {
-      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
-      uint32_t BitWidth = Ty->getScalarSizeInBits();
-      if (Amt->getLimitedValue(BitWidth) < BitWidth &&
-          OrigBitWidth - BitWidth <
-              IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
-        return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
-    }
+    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+    uint32_t BitWidth = Ty->getScalarSizeInBits();
+    KnownBits AmtKnownBits =
+        llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+    unsigned ShiftedBits = OrigBitWidth - BitWidth;
+    if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+        ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
+      return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+             canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
      break;
    }
    case Instruction::Trunc:
@@ -457,7 +477,8 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
  ///   trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
  ///   --->
  ///   extractelement <4 x i32> %X, 1
-static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
+static Instruction *foldVecTruncToExtElt(TruncInst &Trunc,
+                                         InstCombinerImpl &IC) {
    Value *TruncOp = Trunc.getOperand(0);
    Type *DestType = Trunc.getType();
    if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
@@ -483,7 +504,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
    // bitcast it to a vector type that we can extract from.
    unsigned NumVecElts = VecWidth / DestWidth;
    if (VecType->getElementType() != DestType) {
-    VecType = VectorType::get(DestType, NumVecElts);
+    VecType = FixedVectorType::get(DestType, NumVecElts);
      VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
    }
  
@@ -494,9 +515,9 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
    return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
  }
  
-/// Rotate left/right may occur in a wider type than necessary because of type
-/// promotion rules. Try to narrow the inputs and convert to funnel shift.
-Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
+/// Funnel/Rotate left/right may occur in a wider type than necessary because of
+/// type promotion rules. Try to narrow the inputs and convert to funnel shift.
+Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
    assert((isa<VectorType>(Trunc.getSrcTy()) ||
            shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
           "Don't narrow to an illegal scalar type");
@@ -508,32 +529,43 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
    if (!isPowerOf2_32(NarrowWidth))
      return nullptr;
  
-  // First, find an or'd pair of opposite shifts with the same shifted operand:
-  // trunc (or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1))
-  Value *Or0, *Or1;
-  if (!match(Trunc.getOperand(0), m_OneUse(m_Or(m_Value(Or0), m_Value(Or1)))))
+  // First, find an or'd pair of opposite shifts:
+  // trunc (or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1))
+  BinaryOperator *Or0, *Or1;
+  if (!match(Trunc.getOperand(0), m_OneUse(m_Or(m_BinOp(Or0), m_BinOp(Or1)))))
      return nullptr;
  
-  Value *ShVal, *ShAmt0, *ShAmt1;
-  if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal), m_Value(ShAmt0)))) ||
-      !match(Or1, m_OneUse(m_LogicalShift(m_Specific(ShVal), m_Value(ShAmt1)))))
+  Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
+  if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
+      !match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
+      Or0->getOpcode() == Or1->getOpcode())
      return nullptr;
  
-  auto ShiftOpcode0 = cast<BinaryOperator>(Or0)->getOpcode();
-  auto ShiftOpcode1 = cast<BinaryOperator>(Or1)->getOpcode();
-  if (ShiftOpcode0 == ShiftOpcode1)
-    return nullptr;
+  // Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
+  if (Or0->getOpcode() == BinaryOperator::LShr) {
+    std::swap(Or0, Or1);
+    std::swap(ShVal0, ShVal1);
+    std::swap(ShAmt0, ShAmt1);
+  }
+  assert(Or0->getOpcode() == BinaryOperator::Shl &&
+         Or1->getOpcode() == BinaryOperator::LShr &&
+         "Illegal or(shift,shift) pair");
  
-  // Match the shift amount operands for a rotate pattern. This always matches
-  // a subtraction on the R operand.
-  auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * {
+  // Match the shift amount operands for a funnel/rotate pattern. This always
+  // matches a subtraction on the R operand.
+  auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
      // The shift amounts may add up to the narrow bit width:
-    // (shl ShVal, L) | (lshr ShVal, Width - L)
+    // (shl ShVal0, L) | (lshr ShVal1, Width - L)
      if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
        return L;
  
+    // The following patterns currently only work for rotation patterns.
+    // TODO: Add more general funnel-shift compatible patterns.
+    if (ShVal0 != ShVal1)
+      return nullptr;
+
      // The shift amount may be masked with negation:
-    // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
+    // (shl ShVal0, (X & (Width - 1))) | (lshr ShVal1, ((-X) & (Width - 1)))
      Value *X;
      unsigned Mask = Width - 1;
      if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
@@ -549,10 +581,10 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
    };
  
    Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
-  bool SubIsOnLHS = false;
+  bool IsFshl = true; // Sub on LSHR.
    if (!ShAmt) {
      ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
-    SubIsOnLHS = true;
+    IsFshl = false; // Sub on SHL.
    }
    if (!ShAmt)
      return nullptr;
@@ -561,26 +593,28 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
    // will be a zext, but it could also be the result of an 'and' or 'shift'.
    unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
    APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
-  if (!MaskedValueIsZero(ShVal, HiBitMask, 0, &Trunc))
+  if (!MaskedValueIsZero(ShVal0, HiBitMask, 0, &Trunc) ||
+      !MaskedValueIsZero(ShVal1, HiBitMask, 0, &Trunc))
      return nullptr;
  
    // We have an unnecessarily wide rotate!
-  // trunc (or (lshr ShVal, ShAmt), (shl ShVal, BitWidth - ShAmt))
+  // trunc (or (lshr ShVal0, ShAmt), (shl ShVal1, BitWidth - ShAmt))
    // Narrow the inputs and convert to funnel shift intrinsic:
    // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
    Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
-  Value *X = Builder.CreateTrunc(ShVal, DestTy);
-  bool IsFshl = (!SubIsOnLHS && ShiftOpcode0 == BinaryOperator::Shl) ||
-                (SubIsOnLHS && ShiftOpcode1 == BinaryOperator::Shl);
+  Value *X, *Y;
+  X = Y = Builder.CreateTrunc(ShVal0, DestTy);
+  if (ShVal0 != ShVal1)
+    Y = Builder.CreateTrunc(ShVal1, DestTy);
    Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
    Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
-  return IntrinsicInst::Create(F, { X, X, NarrowShAmt });
+  return IntrinsicInst::Create(F, {X, Y, NarrowShAmt});
  }
  
  /// Try to narrow the width of math or bitwise logic instructions by pulling a
  /// truncate ahead of binary operators.
  /// TODO: Transforms for truncated shifts should be moved into here.
-Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) {
+Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
    Type *SrcTy = Trunc.getSrcTy();
    Type *DestTy = Trunc.getType();
    if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
@@ -629,7 +663,7 @@ Instruction *InstCombiner::narrowBinOp(TruncInst &Trunc) {
    default: break;
    }
  
-  if (Instruction *NarrowOr = narrowRotate(Trunc))
+  if (Instruction *NarrowOr = narrowFunnelShift(Trunc))
      return NarrowOr;
  
    return nullptr;
@@ -685,30 +719,50 @@ static Instruction *shrinkInsertElt(CastInst &Trunc,
    return nullptr;
  }
  
-Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
-  if (Instruction *Result = commonCastTransforms(CI))
+Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
+  if (Instruction *Result = commonCastTransforms(Trunc))
      return Result;
  
-  Value *Src = CI.getOperand(0);
-  Type *DestTy = CI.getType(), *SrcTy = Src->getType();
-  ConstantInt *Cst;
+  Value *Src = Trunc.getOperand(0);
+  Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
+  unsigned DestWidth = DestTy->getScalarSizeInBits();
+  unsigned SrcWidth = SrcTy->getScalarSizeInBits();
  
    // Attempt to truncate the entire input expression tree to the destination
    // type.   Only do this if the dest type is a simple type, don't convert the
    // expression tree to something weird like i93 unless the source is also
    // strange.
    if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
-      canEvaluateTruncated(Src, DestTy, *this, &CI)) {
+      canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
  
      // If this cast is a truncate, evaluting in a different type always
      // eliminates the cast, so it is always a win.
      LLVM_DEBUG(
          dbgs() << "ICE: EvaluateInDifferentType converting expression type"
                    " to avoid cast: "
-               << CI << '\n');
+               << Trunc << '\n');
      Value *Res = EvaluateInDifferentType(Src, DestTy, false);
      assert(Res->getType() == DestTy);
-    return replaceInstUsesWith(CI, Res);
+    return replaceInstUsesWith(Trunc, Res);
+  }
+
+  // For integer types, check if we can shorten the entire input expression to
+  // DestWidth * 2, which won't allow removing the truncate, but reducing the
+  // width may enable further optimizations, e.g. allowing for larger
+  // vectorization factors.
+  if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
+    if (DestWidth * 2 < SrcWidth) {
+      auto *NewDestTy = DestITy->getExtendedType();
+      if (shouldChangeType(SrcTy, NewDestTy) &&
+          canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
+        LLVM_DEBUG(
+            dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+                      " to reduce the width of operand of"
+                   << Trunc << '\n');
+        Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
+        return new TruncInst(Res, DestTy);
+      }
+    }
    }
  
    // Test if the trunc is the user of a select which is part of a
@@ -716,17 +770,17 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
    // Even simplifying demanded bits can break the canonical form of a
    // min/max.
    Value *LHS, *RHS;
-  if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
-    if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+  if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
+    if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
        return nullptr;
  
    // See if we can simplify any instructions used by the input whose sole
    // purpose is to compute bits we don't care about.
-  if (SimplifyDemandedInstructionBits(CI))
-    return &CI;
+  if (SimplifyDemandedInstructionBits(Trunc))
+    return &Trunc;
  
-  if (DestTy->getScalarSizeInBits() == 1) {
-    Value *Zero = Constant::getNullValue(Src->getType());
+  if (DestWidth == 1) {
+    Value *Zero = Constant::getNullValue(SrcTy);
      if (DestTy->isIntegerTy()) {
        // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
        // TODO: We canonicalize to more instructions here because we are probably
@@ -740,109 +794,106 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
      // For vectors, we do not canonicalize all truncs to icmp, so optimize
      // patterns that would be covered within visitICmpInst.
      Value *X;
-    const APInt *C;
-    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+    Constant *C;
+    if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
        // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+      Constant *MaskC = ConstantExpr::getShl(One, C);
+      Value *And = Builder.CreateAnd(X, MaskC);
        return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
      }
-    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+    if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
                                     m_Deferred(X))))) {
        // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
-      APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
-      Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+      Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+      Constant *MaskC = ConstantExpr::getShl(One, C);
+      MaskC = ConstantExpr::getOr(MaskC, One);
+      Value *And = Builder.CreateAnd(X, MaskC);
        return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
      }
    }
  
-  // FIXME: Maybe combine the next two transforms to handle the no cast case
-  // more efficiently. Support vector types. Cleanup code by using m_OneUse.
-
-  // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
-  Value *A = nullptr;
-  if (Src->hasOneUse() &&
-      match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
-    // We have three types to worry about here, the type of A, the source of
-    // the truncate (MidSize), and the destination of the truncate. We know that
-    // ASize < MidSize   and MidSize > ResultSize, but don't know the relation
-    // between ASize and ResultSize.
-    unsigned ASize = A->getType()->getPrimitiveSizeInBits();
-
-    // If the shift amount is larger than the size of A, then the result is
-    // known to be zero because all the input bits got shifted out.
-    if (Cst->getZExtValue() >= ASize)
-      return replaceInstUsesWith(CI, Constant::getNullValue(DestTy));
-
-    // Since we're doing an lshr and a zero extend, and know that the shift
-    // amount is smaller than ASize, it is always safe to do the shift in A's
-    // type, then zero extend or truncate to the result.
-    Value *Shift = Builder.CreateLShr(A, Cst->getZExtValue());
-    Shift->takeName(Src);
-    return CastInst::CreateIntegerCast(Shift, DestTy, false);
-  }
-
-  // FIXME: We should canonicalize to zext/trunc and remove this transform.
-  // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type
-  // conversion.
-  // It works because bits coming from sign extension have the same value as
-  // the sign bit of the original value; performing ashr instead of lshr
-  // generates bits of the same value as the sign bit.
-  if (Src->hasOneUse() &&
-      match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
-    Value *SExt = cast<Instruction>(Src)->getOperand(0);
-    const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
-    const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
-    const unsigned CISize = CI.getType()->getPrimitiveSizeInBits();
-    const unsigned MaxAmt = SExtSize - std::max(CISize, ASize);
-    unsigned ShiftAmt = Cst->getZExtValue();
-
-    // This optimization can be only performed when zero bits generated by
-    // the original lshr aren't pulled into the value after truncation, so we
-    // can only shift by values no larger than the number of extension bits.
-    // FIXME: Instead of bailing when the shift is too large, use and to clear
-    // the extra bits.
-    if (ShiftAmt <= MaxAmt) {
-      if (CISize == ASize)
-        return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(),
-                                          std::min(ShiftAmt, ASize - 1)));
-      if (SExt->hasOneUse()) {
-        Value *Shift = Builder.CreateAShr(A, std::min(ShiftAmt, ASize - 1));
-        Shift->takeName(Src);
-        return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+  Value *A;
+  Constant *C;
+  if (match(Src, m_LShr(m_SExt(m_Value(A)), m_Constant(C)))) {
+    unsigned AWidth = A->getType()->getScalarSizeInBits();
+    unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
+    auto *OldSh = cast<Instruction>(Src);
+    bool IsExact = OldSh->isExact();
+
+    // If the shift is small enough, all zero bits created by the shift are
+    // removed by the trunc.
+    if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
+                                    APInt(SrcWidth, MaxShiftAmt)))) {
+      // trunc (lshr (sext A), C) --> ashr A, C
+      if (A->getType() == DestTy) {
+        Constant *MaxAmt = ConstantInt::get(SrcTy, DestWidth - 1, false);
+        Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
+        ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
+        ShAmt = Constant::mergeUndefsWith(ShAmt, C);
+        return IsExact ? BinaryOperator::CreateExactAShr(A, ShAmt)
+                       : BinaryOperator::CreateAShr(A, ShAmt);
+      }
+      // The types are mismatched, so create a cast after shifting:
+      // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
+      if (Src->hasOneUse()) {
+        Constant *MaxAmt = ConstantInt::get(SrcTy, AWidth - 1, false);
+        Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
+        ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
+        Value *Shift = Builder.CreateAShr(A, ShAmt, "", IsExact);
+        return CastInst::CreateIntegerCast(Shift, DestTy, true);
        }
      }
+    // TODO: Mask high bits with 'and'.
+  }
+
+  // trunc (*shr (trunc A), C) --> trunc(*shr A, C)
+  if (match(Src, m_OneUse(m_Shr(m_Trunc(m_Value(A)), m_Constant(C))))) {
+    unsigned MaxShiftAmt = SrcWidth - DestWidth;
+
+    // If the shift is small enough, all zero/sign bits created by the shift are
+    // removed by the trunc.
+    if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
+                                    APInt(SrcWidth, MaxShiftAmt)))) {
+      auto *OldShift = cast<Instruction>(Src);
+      bool IsExact = OldShift->isExact();
+      auto *ShAmt = ConstantExpr::getIntegerCast(C, A->getType(), true);
+      ShAmt = Constant::mergeUndefsWith(ShAmt, C);
+      Value *Shift =
+          OldShift->getOpcode() == Instruction::AShr
+              ? Builder.CreateAShr(A, ShAmt, OldShift->getName(), IsExact)
+              : Builder.CreateLShr(A, ShAmt, OldShift->getName(), IsExact);
+      return CastInst::CreateTruncOrBitCast(Shift, DestTy);
+    }
    }
  
-  if (Instruction *I = narrowBinOp(CI))
+  if (Instruction *I = narrowBinOp(Trunc))
      return I;
  
-  if (Instruction *I = shrinkSplatShuffle(CI, Builder))
+  if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
      return I;
  
-  if (Instruction *I = shrinkInsertElt(CI, Builder))
+  if (Instruction *I = shrinkInsertElt(Trunc, Builder))
      return I;
  
-  if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
-      shouldChangeType(SrcTy, DestTy)) {
+  if (Src->hasOneUse() &&
+      (isa<VectorType>(SrcTy) || shouldChangeType(SrcTy, DestTy))) {
      // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
      // dest type is native and cst < dest size.
-    if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
+    if (match(Src, m_Shl(m_Value(A), m_Constant(C))) &&
          !match(A, m_Shr(m_Value(), m_Constant()))) {
        // Skip shifts of shift by constants. It undoes a combine in
        // FoldShiftByConstant and is the extend in reg pattern.
-      const unsigned DestSize = DestTy->getScalarSizeInBits();
-      if (Cst->getValue().ult(DestSize)) {
+      APInt Threshold = APInt(C->getType()->getScalarSizeInBits(), DestWidth);
+      if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold))) {
          Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
-
-        return BinaryOperator::Create(
-          Instruction::Shl, NewTrunc,
-          ConstantInt::get(DestTy, Cst->getValue().trunc(DestSize)));
+        return BinaryOperator::Create(Instruction::Shl, NewTrunc,
+                                      ConstantExpr::getTrunc(C, DestTy));
        }
      }
    }
  
-  if (Instruction *I = foldVecTruncToExtElt(CI, *this))
+  if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
      return I;
  
    // Whenever an element is extracted from a vector, and then truncated,
@@ -854,23 +905,23 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
    //   --->
    //   extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
    Value *VecOp;
+  ConstantInt *Cst;
    if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
      auto *VecOpTy = cast<VectorType>(VecOp->getType());
-    unsigned DestScalarSize = DestTy->getScalarSizeInBits();
-    unsigned VecOpScalarSize = VecOpTy->getScalarSizeInBits();
-    unsigned VecNumElts = VecOpTy->getNumElements();
+    auto VecElts = VecOpTy->getElementCount();
  
      // A badly fit destination size would result in an invalid cast.
-    if (VecOpScalarSize % DestScalarSize == 0) {
-      uint64_t TruncRatio = VecOpScalarSize / DestScalarSize;
-      uint64_t BitCastNumElts = VecNumElts * TruncRatio;
+    if (SrcWidth % DestWidth == 0) {
+      uint64_t TruncRatio = SrcWidth / DestWidth;
+      uint64_t BitCastNumElts = VecElts.getKnownMinValue() * TruncRatio;
        uint64_t VecOpIdx = Cst->getZExtValue();
        uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
                                           : VecOpIdx * TruncRatio;
        assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
               "overflow 32-bits");
  
-      Type *BitCastTo = VectorType::get(DestTy, BitCastNumElts);
+      auto *BitCastTo =
+          VectorType::get(DestTy, BitCastNumElts, VecElts.isScalable());
        Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
        return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
      }
@@ -879,8 +930,8 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
    return nullptr;
  }
  
-Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
-                                             bool DoTransform) {
+Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
+                                                 bool DoTransform) {
    // If we are just checking for a icmp eq of a single bit and zext'ing it
    // to an integer, then shift the bit to the appropriate place and then
    // cast to integer to avoid the comparison.
@@ -1016,7 +1067,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
  ///
  /// This function works on both vectors and scalars.
  static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
-                             InstCombiner &IC, Instruction *CxtI) {
+                             InstCombinerImpl &IC, Instruction *CxtI) {
    BitsToClear = 0;
    if (canAlwaysEvaluateInType(V, Ty))
      return true;
@@ -1121,7 +1172,7 @@ static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
    }
  }
  
-Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
    // If this zero extend is only used by a truncate, let the truncate be
    // eliminated before we try to optimize this zext.
    if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
@@ -1219,6 +1270,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
      ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
      ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
      if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+        LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
          (transformZExtICmp(LHS, CI, false) ||
           transformZExtICmp(RHS, CI, false))) {
        // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
@@ -1259,7 +1311,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
  }
  
  /// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
-Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
+Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI,
+                                                 Instruction &CI) {
    Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
    ICmpInst::Predicate Pred = ICI->getPredicate();
  
@@ -1395,7 +1448,7 @@ static bool canEvaluateSExtd(Value *V, Type *Ty) {
    return false;
  }
  
-Instruction *InstCombiner::visitSExt(SExtInst &CI) {
+Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
    // If this sign extend is only used by a truncate, let the truncate be
    // eliminated before we try to optimize this sext.
    if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
@@ -1458,30 +1511,33 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
    // for a truncate.  If the source and dest are the same type, eliminate the
    // trunc and extend and just do shifts.  For example, turn:
    //   %a = trunc i32 %i to i8
-  //   %b = shl i8 %a, 6
-  //   %c = ashr i8 %b, 6
+  //   %b = shl i8 %a, C
+  //   %c = ashr i8 %b, C
    //   %d = sext i8 %c to i32
    // into:
-  //   %a = shl i32 %i, 30
-  //   %d = ashr i32 %a, 30
+  //   %a = shl i32 %i, 32-(8-C)
+  //   %d = ashr i32 %a, 32-(8-C)
    Value *A = nullptr;
    // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
-  ConstantInt *BA = nullptr, *CA = nullptr;
-  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
-                        m_ConstantInt(CA))) &&
-      BA == CA && A->getType() == CI.getType()) {
-    unsigned MidSize = Src->getType()->getScalarSizeInBits();
-    unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
-    unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
-    Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
-    A = Builder.CreateShl(A, ShAmtV, CI.getName());
-    return BinaryOperator::CreateAShr(A, ShAmtV);
+  Constant *BA = nullptr, *CA = nullptr;
+  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
+                        m_Constant(CA))) &&
+      BA->isElementWiseEqual(CA) && A->getType() == DestTy) {
+    Constant *WideCurrShAmt = ConstantExpr::getSExt(CA, DestTy);
+    Constant *NumLowbitsLeft = ConstantExpr::getSub(
+        ConstantInt::get(DestTy, SrcTy->getScalarSizeInBits()), WideCurrShAmt);
+    Constant *NewShAmt = ConstantExpr::getSub(
+        ConstantInt::get(DestTy, DestTy->getScalarSizeInBits()),
+        NumLowbitsLeft);
+    NewShAmt =
+        Constant::mergeUndefsWith(Constant::mergeUndefsWith(NewShAmt, BA), CA);
+    A = Builder.CreateShl(A, NewShAmt, CI.getName());
+    return BinaryOperator::CreateAShr(A, NewShAmt);
    }
  
    return nullptr;
  }
  
-
  /// Return a Constant* for the specified floating-point constant if it fits
  /// in the specified FP type without changing its value.
  static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
@@ -1519,7 +1575,7 @@ static Type *shrinkFPConstantVector(Value *V) {
  
    Type *MinType = nullptr;
  
-  unsigned NumElts = CVVTy->getNumElements();
+  unsigned NumElts = cast<FixedVectorType>(CVVTy)->getNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
      if (!CFP)
@@ -1536,7 +1592,7 @@ static Type *shrinkFPConstantVector(Value *V) {
    }
  
    // Make a vector type from the minimal type.
-  return VectorType::get(MinType, NumElts);
+  return FixedVectorType::get(MinType, NumElts);
  }
  
  /// Find the minimum FP type we can safely truncate to.
@@ -1600,7 +1656,7 @@ static bool isKnownExactCastIntToFP(CastInst &I) {
    return false;
  }
  
-Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
+Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
    if (Instruction *I = commonCastTransforms(FPT))
      return I;
  
@@ -1741,6 +1797,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
      case Intrinsic::nearbyint:
      case Intrinsic::rint:
      case Intrinsic::round:
+    case Intrinsic::roundeven:
      case Intrinsic::trunc: {
        Value *Src = II->getArgOperand(0);
        if (!Src->hasOneUse())
@@ -1783,7 +1840,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
    return nullptr;
  }
  
-Instruction *InstCombiner::visitFPExt(CastInst &FPExt) {
+Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
    // If the source operand is a cast from integer to FP and known exact, then
    // cast the integer operand directly to the destination type.
    Type *Ty = FPExt.getType();
@@ -1801,7 +1858,7 @@ Instruction *InstCombiner::visitFPExt(CastInst &FPExt) {
  /// This is safe if the intermediate type has enough bits in its mantissa to
  /// accurately represent all values of X.  For example, this won't work with
  /// i64 -> float -> i64.
-Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) {
+Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
    if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
      return nullptr;
  
@@ -1841,29 +1898,29 @@ Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) {
    return replaceInstUsesWith(FI, X);
  }
  
-Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
    if (Instruction *I = foldItoFPtoI(FI))
      return I;
  
    return commonCastTransforms(FI);
  }
  
-Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
    if (Instruction *I = foldItoFPtoI(FI))
      return I;
  
    return commonCastTransforms(FI);
  }
  
-Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
    return commonCastTransforms(CI);
  }
  
-Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
    return commonCastTransforms(CI);
  }
  
-Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
+Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
    // If the source integer type is not the intptr_t type for this target, do a
    // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
    // cast to be exposed to other transforms.
@@ -1886,7 +1943,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
  }
  
  /// Implement the transforms for cast of pointer (bitcast/ptrtoint)
-Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
+Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) {
    Value *Src = CI.getOperand(0);
  
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
@@ -1908,23 +1965,37 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
    return commonCastTransforms(CI);
  }
  
-Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
+Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
    // If the destination integer type is not the intptr_t type for this target,
    // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
    // to be exposed to other transforms.
-
+  Value *SrcOp = CI.getPointerOperand();
    Type *Ty = CI.getType();
    unsigned AS = CI.getPointerAddressSpace();
+  unsigned TySize = Ty->getScalarSizeInBits();
+  unsigned PtrSize = DL.getPointerSizeInBits(AS);
+  if (TySize != PtrSize) {
+    Type *IntPtrTy = DL.getIntPtrType(CI.getContext(), AS);
+    // Handle vectors of pointers.
+    if (auto *VecTy = dyn_cast<VectorType>(Ty))
+      IntPtrTy = VectorType::get(IntPtrTy, VecTy->getElementCount());
  
-  if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS))
-    return commonPointerCastTransforms(CI);
+    Value *P = Builder.CreatePtrToInt(SrcOp, IntPtrTy);
+    return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
+  }
  
-  Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
-  if (auto *VTy = dyn_cast<VectorType>(Ty)) // Handle vectors of pointers.
-    PtrTy = VectorType::get(PtrTy, VTy->getNumElements());
+  Value *Vec, *Scalar, *Index;
+  if (match(SrcOp, m_OneUse(m_InsertElt(m_IntToPtr(m_Value(Vec)),
+                                        m_Value(Scalar), m_Value(Index)))) &&
+      Vec->getType() == Ty) {
+    assert(Vec->getType()->getScalarSizeInBits() == PtrSize && "Wrong type");
+    // Convert the scalar to int followed by insert to eliminate one cast:
+    // p2i (ins (i2p Vec), Scalar, Index --> ins Vec, (p2i Scalar), Index
+    Value *NewCast = Builder.CreatePtrToInt(Scalar, Ty->getScalarType());
+    return InsertElementInst::Create(Vec, NewCast, Index);
+  }
  
-  Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy);
-  return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
+  return commonPointerCastTransforms(CI);
  }
  
  /// This input value (which is known to have vector type) is being zero extended
@@ -1943,9 +2014,9 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
  /// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
  ///
  /// The source and destination vector types may have different element types.
-static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
-                                                            VectorType *DestTy,
-                                                            InstCombiner &IC) {
+static Instruction *
+optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
+                                        InstCombinerImpl &IC) {
    // We can only do this optimization if the output is a multiple of the input
    // element size, or the input is a multiple of the output element size.
    // Convert the input type to have the same element type as the output.
@@ -1960,13 +2031,15 @@ static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
          DestTy->getElementType()->getPrimitiveSizeInBits())
        return nullptr;
  
-    SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+    SrcTy =
+        FixedVectorType::get(DestTy->getElementType(),
+                             cast<FixedVectorType>(SrcTy)->getNumElements());
      InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
    }
  
    bool IsBigEndian = IC.getDataLayout().isBigEndian();
-  unsigned SrcElts = SrcTy->getNumElements();
-  unsigned DestElts = DestTy->getNumElements();
+  unsigned SrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
+  unsigned DestElts = cast<FixedVectorType>(DestTy)->getNumElements();
  
    assert(SrcElts != DestElts && "Element counts should be different.");
  
@@ -2144,8 +2217,8 @@ static bool collectInsertionElements(Value *V, unsigned Shift,
  ///
  /// Into two insertelements that do "buildvector{%inc, %inc5}".
  static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
-                                                InstCombiner &IC) {
-  VectorType *DestVecTy = cast<VectorType>(CI.getType());
+                                                InstCombinerImpl &IC) {
+  auto *DestVecTy = cast<FixedVectorType>(CI.getType());
    Value *IntInput = CI.getOperand(0);
  
    SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
@@ -2173,7 +2246,7 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
  /// vectors better than bitcasts of scalars because vector registers are
  /// usually not type-specific like scalar integer or scalar floating-point.
  static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
-                                              InstCombiner &IC) {
+                                              InstCombinerImpl &IC) {
    // TODO: Create and use a pattern matcher for ExtractElementInst.
    auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
    if (!ExtElt || !ExtElt->hasOneUse())
@@ -2185,8 +2258,7 @@ static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
    if (!VectorType::isValidElementType(DestType))
      return nullptr;
  
-  unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements();
-  auto *NewVecType = VectorType::get(DestType, NumElts);
+  auto *NewVecType = VectorType::get(DestType, ExtElt->getVectorOperandType());
    auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
                                           NewVecType, "bc");
    return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
@@ -2249,12 +2321,11 @@ static Instruction *foldBitCastSelect(BitCastInst &BitCast,
    // A vector select must maintain the same number of elements in its operands.
    Type *CondTy = Cond->getType();
    Type *DestTy = BitCast.getType();
-  if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
-    if (!DestTy->isVectorTy())
-      return nullptr;
-    if (cast<VectorType>(DestTy)->getNumElements() != CondVTy->getNumElements())
+  if (auto *CondVTy = dyn_cast<VectorType>(CondTy))
+    if (!DestTy->isVectorTy() ||
+        CondVTy->getElementCount() !=
+            cast<VectorType>(DestTy)->getElementCount())
        return nullptr;
-  }
  
    // FIXME: This transform is restricted from changing the select between
    // scalars and vectors to avoid backend problems caused by creating
@@ -2299,7 +2370,8 @@ static bool hasStoreUsersOnly(CastInst &CI) {
  ///
  /// All the related PHI nodes can be replaced by new PHI nodes with type A.
  /// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
-Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
+Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
+                                                      PHINode *PN) {
    // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
    if (hasStoreUsersOnly(CI))
      return nullptr;
@@ -2429,10 +2501,7 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
    Instruction *RetVal = nullptr;
    for (auto *OldPN : OldPhiNodes) {
      PHINode *NewPN = NewPNodes[OldPN];
-    for (auto It = OldPN->user_begin(), End = OldPN->user_end(); It != End; ) {
-      User *V = *It;
-      // We may remove this user, advance to avoid iterator invalidation.
-      ++It;
+    for (User *V : make_early_inc_range(OldPN->users())) {
        if (auto *SI = dyn_cast<StoreInst>(V)) {
          assert(SI->isSimple() && SI->getOperand(0) == OldPN);
          Builder.SetInsertPoint(SI);
@@ -2452,7 +2521,7 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
          if (BCI == &CI)
            RetVal = I;
        } else if (auto *PHI = dyn_cast<PHINode>(V)) {
-        assert(OldPhiNodes.count(PHI) > 0);
+        assert(OldPhiNodes.contains(PHI));
          (void) PHI;
        } else {
          llvm_unreachable("all uses should be handled");
@@ -2463,7 +2532,7 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
    return RetVal;
  }
  
-Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
+Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
    // If the operands are integer typed then apply the integer transforms,
    // otherwise just apply the common ones.
    Value *Src = CI.getOperand(0);
@@ -2475,8 +2544,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
    if (DestTy == Src->getType())
      return replaceInstUsesWith(CI, Src);
  
-  if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+  if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
      PointerType *SrcPTy = cast<PointerType>(SrcTy);
+    PointerType *DstPTy = cast<PointerType>(DestTy);
      Type *DstElTy = DstPTy->getElementType();
      Type *SrcElTy = SrcPTy->getElementType();
  
@@ -2533,7 +2603,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      }
    }
  
-  if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+  if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
      // Beware: messing with this target-specific oddity may cause trouble.
      if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
        Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
@@ -2562,7 +2632,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      }
    }
  
-  if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+  if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
      if (SrcVTy->getNumElements() == 1) {
        // If our destination is not a vector, then make this a straight
        // scalar-scalar cast.
@@ -2586,12 +2656,11 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      // a bitcast to a vector with the same # elts.
      Value *ShufOp0 = Shuf->getOperand(0);
      Value *ShufOp1 = Shuf->getOperand(1);
-    unsigned NumShufElts = Shuf->getType()->getNumElements();
-    unsigned NumSrcVecElts =
-        cast<VectorType>(ShufOp0->getType())->getNumElements();
+    auto ShufElts = cast<VectorType>(Shuf->getType())->getElementCount();
+    auto SrcVecElts = cast<VectorType>(ShufOp0->getType())->getElementCount();
      if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
-        cast<VectorType>(DestTy)->getNumElements() == NumShufElts &&
-        NumShufElts == NumSrcVecElts) {
+        cast<VectorType>(DestTy)->getElementCount() == ShufElts &&
+        ShufElts == SrcVecElts) {
        BitCastInst *Tmp;
        // If either of the operands is a cast from CI.getType(), then
        // evaluating the shuffle in the casted destination's type will allow
@@ -2614,8 +2683,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
      // TODO: We should match the related pattern for bitreverse.
      if (DestTy->isIntegerTy() &&
          DL.isLegalInteger(DestTy->getScalarSizeInBits()) &&
-        SrcTy->getScalarSizeInBits() == 8 && NumShufElts % 2 == 0 &&
-        Shuf->hasOneUse() && Shuf->isReverse()) {
+        SrcTy->getScalarSizeInBits() == 8 &&
+        ShufElts.getKnownMinValue() % 2 == 0 && Shuf->hasOneUse() &&
+        Shuf->isReverse()) {
        assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask");
        assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op");
        Function *Bswap =
@@ -2644,7 +2714,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
    return commonCastTransforms(CI);
  }
  
-Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
    // If the destination pointer element type is not the same as the source's
    // first do a bitcast to the destination type, and then the addrspacecast.
    // This allows the cast to be exposed to other transforms.
@@ -2655,10 +2725,9 @@ Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
    Type *DestElemTy = DestTy->getElementType();
    if (SrcTy->getElementType() != DestElemTy) {
      Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
-    if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) {
-      // Handle vectors of pointers.
-      MidTy = VectorType::get(MidTy, VT->getNumElements());
-    }
+    // Handle vectors of pointers.
+    if (VectorType *VT = dyn_cast<VectorType>(CI.getType()))
+      MidTy = VectorType::get(MidTy, VT->getElementCount());
  
      Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
      return new AddrSpaceCastInst(NewBitCast, CI.getType());