From e5af2d3a224d4b38760a26d237cde040cb6e14eb Mon Sep 17 00:00:00 2001
From: Dan Gohman
Date: Thu, 29 Jan 2009 01:59:02 +0000
Subject: [PATCH] Make x86's BT instruction matching more thorough, and add
 some dagcombines that help it match in several more cases. Add several more
 cases to test/CodeGen/X86/bt.ll. This doesn't yet include matching for BT
 with an immediate operand, it just covers more register+register cases.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@63266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h        |   2 +
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp    |  50 ++--
 lib/CodeGen/SelectionDAG/TargetLowering.cpp |  43 ++-
 lib/Target/X86/X86ISelLowering.cpp          |  70 +++--
 test/CodeGen/X86/bt.ll                      | 420 +++++++++++++++++++++++++++-
 test/CodeGen/X86/commute-cmov.ll            |  17 +-
 6 files changed, 562 insertions(+), 40 deletions(-)

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index e6f18ea43f8..4ec7d3f6279 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -780,6 +780,8 @@ public:
     SDValue CombineTo(SDNode *N, const std::vector<SDValue> &To);
     SDValue CombineTo(SDNode *N, SDValue Res);
     SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1);
+
+    void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
   };
 
   /// SimplifySetCC - Try to simplify a setcc built with the specified operands
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 48e556b2362..848051940f7 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -102,6 +102,8 @@ namespace {
       SDValue To[] = { Res0, Res1 };
       return CombineTo(N, To, 2, AddTo);
     }
+
+    void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
 
   private:
@@ -298,6 +300,10 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1) {
   return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1);
 }
 
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
 
 //===----------------------------------------------------------------------===//
 // Helper Functions
@@ -539,29 +545,14 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
   return SDValue(N, 0);
 }
 
-/// SimplifyDemandedBits - Check the specified integer node value to see if
-/// it can be simplified or if things it uses can be simplified by bit
-/// propagation. If so, return true.
-bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
-  TargetLowering::TargetLoweringOpt TLO(DAG);
-  APInt KnownZero, KnownOne;
-  if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
-    return false;
-
-  // Revisit the node.
-  AddToWorkList(Op.getNode());
-
-  // Replace the old value with the new one.
-  ++NodesCombined;
-  DOUT << "\nReplacing.2 "; DEBUG(TLO.Old.getNode()->dump(&DAG));
-  DOUT << "\nWith: "; DEBUG(TLO.New.getNode()->dump(&DAG));
-  DOUT << '\n';
-
+void
+DAGCombiner::CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &
+                                     TLO) {
   // Replace all uses. If any nodes become isomorphic to other nodes and
   // are deleted, make sure to remove them from our worklist.
   WorkListRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New, &DeadNodes);
-
+
   // Push the new node and any (possibly new) users onto the worklist.
   AddToWorkList(TLO.New.getNode());
   AddUsersToWorkList(TLO.New.getNode());
@@ -580,6 +571,27 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
     DAG.DeleteNode(TLO.Old.getNode());
   }
+}
+
+/// SimplifyDemandedBits - Check the specified integer node value to see if
+/// it can be simplified or if things it uses can be simplified by bit
+/// propagation. If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+  TargetLowering::TargetLoweringOpt TLO(DAG);
+  APInt KnownZero, KnownOne;
+  if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    return false;
+
+  // Revisit the node.
+  AddToWorkList(Op.getNode());
+
+  // Replace the old value with the new one.
+  ++NodesCombined;
+  DOUT << "\nReplacing.2 "; DEBUG(TLO.Old.getNode()->dump(&DAG));
+  DOUT << "\nWith: "; DEBUG(TLO.New.getNode()->dump(&DAG));
+  DOUT << '\n';
+
+  CommitTargetLoweringOpt(TLO);
 
   return true;
 }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7245244684d..e479e05bdd7 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -724,7 +724,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
                                                                const APInt &Demanded) {
   // FIXME: ISD::SELECT, ISD::SELECT_CC
-  switch(Op.getOpcode()) {
+  switch (Op.getOpcode()) {
   default: break;
   case ISD::AND:
   case ISD::OR:
@@ -1054,6 +1054,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     }
     break;
   case ISD::SRA:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable. The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
+    if (DemandedMask == 1)
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, Op.getValueType(),
+                                               Op.getOperand(0), Op.getOperand(1)));
+
     if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
       MVT VT = Op.getValueType();
       unsigned ShAmt = SA->getZExtValue();
@@ -1332,6 +1340,21 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
   return 1;
 }
 
+static bool ValueHasAtMostOneBitSet(SDValue Val, const SelectionDAG &DAG) {
+  // Logical shift right or left won't ever introduce new set bits.
+  // We check for this case because we don't care which bits are
+  // set, but ComputeMaskedBits won't know anything unless it can
+  // determine which specific bits may be set.
+  if (Val.getOpcode() == ISD::SHL || Val.getOpcode() == ISD::SRL)
+    return ValueHasAtMostOneBitSet(Val.getOperand(0), DAG);
+
+  MVT OpVT = Val.getValueType();
+  unsigned BitWidth = OpVT.getSizeInBits();
+  APInt Mask = APInt::getAllOnesValue(BitWidth);
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(Val, Mask, KnownZero, KnownOne);
+  return KnownZero.countPopulation() == BitWidth - 1;
+}
 
 /// SimplifySetCC - Try to simplify a setcc built with the specified operands
 /// and cc. If it is unable to simplify it, return a null SDValue.
@@ -1791,6 +1814,24 @@ TargetLowering::SimplifySetCC(MVT VT, SDValue N0, SDValue N1,
       }
     }
   }
+
+  // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
+  if (N0.getOpcode() == ISD::AND)
+    if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
+      if (ValueHasAtMostOneBitSet(N1, DAG)) {
+        Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+        SDValue Zero = DAG.getConstant(0, N1.getValueType());
+        return DAG.getSetCC(VT, N0, Zero, Cond);
+      }
+    }
+  if (N1.getOpcode() == ISD::AND)
+    if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
+      if (ValueHasAtMostOneBitSet(N0, DAG)) {
+        Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+        SDValue Zero = DAG.getConstant(0, N0.getValueType());
+        return DAG.getSetCC(VT, N1, Zero, Cond);
+      }
+    }
 }
 
 // Fold away ALL boolean setcc's.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6ec97e2db60..bf7c704b9ba 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5114,22 +5114,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
   SDValue Op1 = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
-  // Lower (X & (1 << N)) == 0 to BT.
-  // Lower ((X >>u N) & 1) != 0 to BT.
-  // Lower ((X >>s N) & 1) != 0 to BT.
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
   if (Op0.getOpcode() == ISD::AND &&
       Op0.hasOneUse() &&
       Op1.getOpcode() == ISD::Constant &&
-      Op0.getOperand(1).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
-    ConstantSDNode *CmpRHS = cast<ConstantSDNode>(Op1);
-    SDValue AndLHS = Op0.getOperand(0);
-    if (CmpRHS->getZExtValue() == 0 && AndRHS->getZExtValue() == 1 &&
-        AndLHS.getOpcode() == ISD::SRL) {
-      SDValue LHS = AndLHS.getOperand(0);
-      SDValue RHS = AndLHS.getOperand(1);
+    SDValue LHS, RHS;
+    if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
+      if (ConstantSDNode *Op010C =
+            dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
+        if (Op010C->getZExtValue() == 1) {
+          LHS = Op0.getOperand(0);
+          RHS = Op0.getOperand(1).getOperand(1);
+        }
+    } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
+      if (ConstantSDNode *Op000C =
+            dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
+        if (Op000C->getZExtValue() == 1) {
+          LHS = Op0.getOperand(1);
+          RHS = Op0.getOperand(0).getOperand(1);
+        }
+    } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
+      ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
+      SDValue AndLHS = Op0.getOperand(0);
+      if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+        LHS = AndLHS.getOperand(0);
+        RHS = AndLHS.getOperand(1);
+      }
+    }
+    if (LHS.getNode()) {
       // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT
       // instruction. Since the shift amount is in-range-or-undefined, we know
       // that doing a bittest on the i16 value is ok. We extend to i32 because
@@ -5141,10 +5158,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
       // BT ignores high bits (like shifts) we can use anyextend.
       if (LHS.getValueType() != RHS.getValueType())
         RHS = DAG.getNode(ISD::ANY_EXTEND, LHS.getValueType(), RHS);
-
+
       SDValue BT = DAG.getNode(X86ISD::BT, MVT::i32, LHS, RHS);
       unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
-      return DAG.getNode(X86ISD::SETCC, MVT::i8,
+      return DAG.getNode(X86ISD::SETCC, MVT::i8,
                          DAG.getConstant(Cond, MVT::i8), BT);
     }
   }
@@ -5295,7 +5312,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
         !isScalarFPTypeInSSEReg(VT))  // FPStack?
       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
 
-    if (isX86LogicalCmp(Opc) && !IllegalFPCMov) {
+    if ((isX86LogicalCmp(Opc) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME
       Cond = Cmp;
       addTest = false;
     }
@@ -7547,6 +7564,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
 
 /// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
 static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget *Subtarget,
                                          const TargetLowering &TLI) {
   unsigned NumOps = N->getNumOperands();
@@ -7587,7 +7605,9 @@ static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
   SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
   SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, Tys, Ops, 2);
-  DAG.ReplaceAllUsesOfValueWith(SDValue(Base, 1), ResNode.getValue(1));
+  TargetLowering::TargetLoweringOpt TLO(DAG);
+  TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
+  DCI.CommitTargetLoweringOpt(TLO);
   return ResNode;
 }
 
@@ -7875,6 +7895,23 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue PerformBTCombine(SDNode *N,
+                                SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // BT ignores high bits in the bit index operand.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG);
+    TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
 
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -7883,7 +7920,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
   case ISD::BUILD_VECTOR:
-    return PerformBuildVectorCombine(N, DAG, Subtarget, *this);
+    return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
   case ISD::SHL:
   case ISD::SRA:
@@ -7892,6 +7929,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FXOR:
   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
+  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   }
 
   return SDValue();
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 86254d3295b..f91130dd69a 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc | grep btl
+; RUN: llvm-as < %s | llc -march=x86 | grep btl | count 28
 ; RUN: llvm-as < %s | llc -mcpu=pentium4 | grep btl | not grep esp
 ; RUN: llvm-as < %s | llc -mcpu=penryn | grep btl | not grep esp
 ; PR3253
@@ -7,8 +7,17 @@
 ; pentium4, however it is currently disabled due to the register+memory
 ; form having different semantics than the register+register form.
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin8"
+; Test these patterns:
+;    (X & (1 << N)) != 0 --> BT(X, N).
+;    ((X >>u N) & 1) != 0 --> BT(X, N).
+; as well as several variations:
+;    - The second form can use an arithmetic shift.
+;    - Either form can use == instead of !=.
+;    - Either form can compare with an operand of the &
+;      instead of with 0.
+;    - The comparison can be commuted (only cases where neither
+;      operand is constant are included).
+;    - The and can be commuted.
 
 define void @test2(i32 %x, i32 %n) nounwind {
 entry:
@@ -25,4 +34,409 @@ UnifiedReturnBlock:  ; preds = %entry
   ret void
 }
 
+define void @test2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @atest2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @atest2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @test3(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @test3b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @testne2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @testne2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @atestne2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @atestne2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @testne3(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @testne3b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, 0  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @aquery2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @aquery2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query3(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp3, %tmp29  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query3b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp eq i32 %tmp3, %tmp29  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query3x(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp eq i32 %tmp29, %tmp3  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @query3bx(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp eq i32 %tmp29, %tmp3  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @aqueryne2(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, 1  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @aqueryne2b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = ashr i32 %x, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 1, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, 1  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne3(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp3, %tmp29  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne3b(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp ne i32 %tmp3, %tmp29  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne3x(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %tmp29, %x  ; <i32> [#uses=1]
+  %tmp4 = icmp ne i32 %tmp29, %tmp3  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
+define void @queryne3bx(i32 %x, i32 %n) nounwind {
+entry:
+  %tmp29 = shl i32 1, %n  ; <i32> [#uses=1]
+  %tmp3 = and i32 %x, %tmp29
+  %tmp4 = icmp ne i32 %tmp29, %tmp3  ; <i1> [#uses=1]
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+
+bb:  ; preds = %entry
+  call void @foo()
+  ret void
+
+UnifiedReturnBlock:  ; preds = %entry
+  ret void
+}
+
 declare void @foo()
diff --git a/test/CodeGen/X86/commute-cmov.ll b/test/CodeGen/X86/commute-cmov.ll
index 24398dc1257..ac0e4ef3e57 100644
--- a/test/CodeGen/X86/commute-cmov.ll
+++ b/test/CodeGen/X86/commute-cmov.ll
@@ -1,5 +1,20 @@
-; RUN: llvm-as < %s | llc -march=x86 | grep {cmove 16(%esp)}
+; RUN: llvm-as < %s | llc -march=x86 > %t
+; RUN: grep btl %t | count 2
+; RUN: grep cmov %t | count 2
+; RUN: not grep test %t
+; RUN: not grep set %t
+; RUN: not grep j %t
+; RUN: not grep cmovne %t
+; RUN: not grep cmove %t
 
+define i32 @foo(i32 %x, i32 %n, i32 %w, i32 %v) nounwind readnone {
+entry:
+  %0 = lshr i32 %x, %n  ; <i32> [#uses=1]
+  %1 = and i32 %0, 1  ; <i32> [#uses=1]
+  %toBool = icmp eq i32 %1, 0  ; <i1> [#uses=1]
+  %.0 = select i1 %toBool, i32 %v, i32 12  ; <i32> [#uses=1]
+  ret i32 %.0
+}
 define i32 @bar(i32 %x, i32 %n, i32 %w, i32 %v) nounwind readnone {
 entry:
   %0 = lshr i32 %x, %n  ; <i32> [#uses=1]
-- 
2.11.0
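
Editor's note (not part of the original patch): for readers skimming the diff, here is a minimal C++ sketch of the source-level patterns the new matching and dagcombines target. The function names are hypothetical; on a 32-bit x86 target (llc -march=x86, as in bt.ll above) each of these is expected to select a btl plus a setb/setae or a conditional branch, rather than a shift/and/test sequence:

    // Variable bit tests that the patch teaches the x86 backend to turn into BT.
    bool bit_shl(unsigned x, unsigned n) { return (x & (1u << n)) != 0; }  // (X & (1 << N)) != 0
    bool bit_shr(unsigned x, unsigned n) { return ((x >> n) & 1) != 0; }   // ((X >>u N) & 1) != 0
    // Compared against an operand of the and: since (1 << n) has exactly one
    // bit set, SimplifySetCC first rewrites this to (x & (1 << n)) != 0,
    // after which the BT matching in LowerSETCC applies.
    bool bit_cmp(unsigned x, unsigned n) { return (x & (1u << n)) == (1u << n); }

BT with an immediate bit index is, per the commit message, deliberately not yet covered; all of the above use a variable (register) index.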