From 7e9986fa34a97b6d7ae4102de57baa9f0b7b0e0f Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Sun, 16 Aug 2015 04:52:11 +0000 Subject: [PATCH] [X86] Widen the 'AND' mask if doing so shrinks the encoding size We can set additional bits in a mask given that we know the other operand of an AND already has some bits set to zero. This can be more efficient if doing so allows us to use an instruction which implicitly sign extends the immediate. This fixes PR24085. Differential Revision: http://reviews.llvm.org/D11289 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245169 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 63 ++++++++++++++++++++++++++++++++++++-- test/CodeGen/X86/shift-pair.ll | 2 +- test/CodeGen/X86/win64_frame.ll | 5 ++- test/CodeGen/X86/zext-fold.ll | 6 ++-- 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index d37db7f788b..3453bf625cd 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -198,6 +198,7 @@ namespace { SDNode *Select(SDNode *N) override; SDNode *SelectGather(SDNode *N, unsigned Opc); SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); + SDNode *SelectAndWithSExtImmediate(SDNode *Node, MVT NVT); bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); @@ -2208,6 +2209,57 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { return ResNode; } +// Try to shrink the encoding of an AND by setting additional bits in the mask. +// It is only correct to do so if we know a priori that the other operand of the +// AND already has those bits set to zero. 
+SDNode *X86DAGToDAGISel::SelectAndWithSExtImmediate(SDNode *Node, MVT NVT) { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (NVT != MVT::i32 && NVT != MVT::i64) + return nullptr; + + auto *Cst = dyn_cast<ConstantSDNode>(N1); + if (!Cst) + return nullptr; + + // As a heuristic, skip over negative constants. It turns out not to be + // productive to widen the mask. + int64_t Val = Cst->getSExtValue(); + if (Val <= 0) + return nullptr; + + // Limit ourselves to constants which already have sign bits to save on + // compile time. + if ((int8_t)Val >= 0) + return nullptr; + + unsigned Opc; + switch (NVT.SimpleTy) { + default: + llvm_unreachable("Unsupported VT!"); + case MVT::i32: + Opc = X86::AND32ri8; + break; + case MVT::i64: + Opc = X86::AND64ri8; + break; + } + + APInt Op0Zero, Op0One; + CurDAG->computeKnownBits(N0, Op0Zero, Op0One); + // Grow the mask using the known zero bits. + Op0Zero |= Val; + // See if the mask can be efficiently encoded using at most NumBits. + if (!Op0Zero.isSignedIntN(8)) + return nullptr; + + SDLoc DL(Node); + SDValue NewCst = + CurDAG->getTargetConstant(Op0Zero.getSExtValue(), DL, MVT::i8); + return CurDAG->getMachineNode(Opc, DL, NVT, N0, NewCst); +} + SDNode *X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opc, MOpc; @@ -2223,7 +2275,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } switch (Opcode) { - default: break; + default: + break; case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -2298,7 +2351,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return RetVal; break; } - case ISD::AND: + case ISD::AND: { + if (SDNode *NewNode = SelectAndWithSExtImmediate(Node, NVT)) { + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + return nullptr; + } + // FALLTHROUGH + } case ISD::OR: case ISD::XOR: { // For operations of the form (x << C1) op C2, check if we can use a smaller diff --git 
a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll index 62e51f002f7..a0cdb27ec23 100644 --- a/test/CodeGen/X86/shift-pair.ll +++ b/test/CodeGen/X86/shift-pair.ll @@ -3,7 +3,7 @@ define i64 @test(i64 %A) { ; CHECK: @test ; CHECK: shrq $54 -; CHECK: andl $1020 +; CHECK: andq $-4 ; CHECK: ret %B = lshr i64 %A, 56 %C = shl i64 %B, 2 diff --git a/test/CodeGen/X86/win64_frame.ll b/test/CodeGen/X86/win64_frame.ll index 477b3144d9e..2c62f4918a7 100644 --- a/test/CodeGen/X86/win64_frame.ll +++ b/test/CodeGen/X86/win64_frame.ll @@ -100,9 +100,8 @@ define i32 @f8(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) "no-frame-pointer-elim"=" alloca i32, i32 %a ; CHECK: movl %ecx, %eax - ; CHECK: leaq 15(,%rax,4), %rcx - ; CHECK: movabsq $34359738352, %rax - ; CHECK: andq %rcx, %rax + ; CHECK: leaq 15(,%rax,4), %rax + ; CHECK: andq $-16, %rax ; CHECK: callq __chkstk ; CHECK: subq %rax, %rsp diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll index a10923f7a80..9757c7a334b 100644 --- a/test/CodeGen/X86/zext-fold.ll +++ b/test/CodeGen/X86/zext-fold.ll @@ -8,7 +8,7 @@ define i32 @test1(i8 %x) nounwind readnone { } ; CHECK: test1 ; CHECK: movzbl -; CHECK-NEXT: andl {{.*}}224 +; CHECK-NEXT: andl {{.*}}-32 ;; Multiple uses of %x but easily extensible. define i32 @test2(i8 %x) nounwind readnone { @@ -21,7 +21,7 @@ define i32 @test2(i8 %x) nounwind readnone { } ; CHECK: test2 ; CHECK: movzbl -; CHECK: andl $224 +; CHECK: andl $-32 ; CHECK: orl $63 declare void @use(i32, i8) @@ -36,6 +36,6 @@ define void @test3(i8 %x) nounwind readnone { ; CHECK: test3 ; CHECK: movzbl {{[0-9]+}}(%esp), [[REGISTER:%e[a-z]{2}]] ; CHECK-NEXT: movl [[REGISTER]], 4(%esp) -; CHECK-NEXT: andl $224, [[REGISTER]] +; CHECK-NEXT: andl $-32, [[REGISTER]] ; CHECK-NEXT: movl [[REGISTER]], (%esp) ; CHECK-NEXT: call{{.*}}use -- 2.11.0