From f8d1af0489461409a332399ccd11c0fd64b860d6 Mon Sep 17 00:00:00 2001
From: Amaury Sechet <deadalnix@gmail.com>
Date: Mon, 29 Jan 2018 20:54:33 +0000
Subject: [PATCH] [X86] Avoid using high register trick for test instruction

Summary:
It seems it's main effect is to create addition copies when values are inr register that do not support this trick, which increase register pressure and makes the code bigger.

The main noteworthy regression I was able to observe was pattern of the type (setcc (trunc (and X, C)), 0) where C is such as it would benefit from the hi register trick. To prevent this, a new pattern is added to materialize such pattern using a 32 bits test. This has the added benefit of working with any constant that is materializable as a 32bits immediate, not just the ones that can leverage the high register trick, as demonstrated by the test case in test-shrink.ll using the constant 2049 .

Reviewers: craig.topper, niravd, spatel, hfinkel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D42646

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323690 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp      | 78 ++++++++++-----------------------
 lib/Target/X86/X86InstrArithmetic.td    |  8 ----
 lib/Target/X86/X86InstrInfo.cpp         |  3 --
 lib/Target/X86/X86MacroFusion.cpp       |  1 -
 test/CodeGen/X86/test-shrink.ll         |  9 ++--
 test/CodeGen/X86/testb-je-fusion.ll     |  3 +-
 test/CodeGen/X86/vastart-defs-eflags.ll |  9 ++--
 7 files changed, 29 insertions(+), 82 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e325d975486..cbc8eee28b0 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3073,67 +3073,33 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         return;
       }
 
-      // For example, "testl %eax, $2048" to "testb %ah, $8".
-      if (isShiftedUInt<8, 8>(Mask) &&
-          (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
-        // Shift the immediate right by 8 bits.
-        SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
-        SDValue Reg = N0.getOperand(0);
-
-        // Extract the h-register.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
-                                                        MVT::i8, Reg);
-
-        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
-        // target GR8_NOREX registers, so make sure the register class is
-        // forced.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
-                                                 MVT::i32, Subreg, ShiftedImm);
-        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
-        // one, do not call ReplaceAllUsesWith.
-        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
-                    SDValue(NewNode, 0));
-        CurDAG->RemoveDeadNode(Node);
-        return;
-      }
-
-      // For example, "testl %eax, $32776" to "testw %ax, $32776".
-      // NOTE: We only want to form TESTW instructions if optimizing for
-      // min size. Otherwise we only save one byte and possibly get a length
-      // changing prefix penalty in the decoders.
-      if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
-          (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
-        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
-        SDValue Reg = N0.getOperand(0);
-
-        // Extract the 16-bit subregister.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
-                                                        MVT::i16, Reg);
-
-        // Emit a testw.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
-                                                 Subreg, Imm);
-        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
-        // one, do not call ReplaceAllUsesWith.
-        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
-                    SDValue(NewNode, 0));
-        CurDAG->RemoveDeadNode(Node);
-        return;
-      }
-
       // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
-      if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
+      if (isUInt<32>(Mask) &&
           (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
-        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
+        MVT VT = MVT::i32;
+        int SubRegOp = X86::sub_32bit;
+        unsigned Op = X86::TEST32ri;
+
+        // For example, "testl %eax, $32776" to "testw %ax, $32776".
+        // NOTE: We only want to form TESTW instructions if optimizing for
+        // min size. Otherwise we only save one byte and possibly get a length
+        // changing prefix penalty in the decoders.
+        if (OptForMinSize && isUInt<16>(Mask) &&
+            (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+          VT = MVT::i16;
+          SubRegOp = X86::sub_16bit;
+          Op = X86::TEST16ri;
+        }
+
+        SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
         SDValue Reg = N0.getOperand(0);
 
-        // Extract the 32-bit subregister.
-        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
-                                                        MVT::i32, Reg);
+        // Extract the subregister if necessary.
+        if (N0.getValueType() != VT)
+          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
 
-        // Emit a testl.
-        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
-                                                 Subreg, Imm);
+        // Emit a testl or testw.
+        SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
         // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
         // one, do not call ReplaceAllUsesWith.
         ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index d09deb5b758..d35b4338c72 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -1257,14 +1257,6 @@ let isCompare = 1 in {
     def TEST32mi   : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
     let Predicates = [In64BitMode] in
     def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
-
-    // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
-    // register class is constrained to GR8_NOREX. This pseudo is explicitly
-    // marked side-effect free, since it doesn't have an isel pattern like
-    // other test instructions.
-    let isPseudo = 1, hasSideEffects = 0 in
-    def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
-                          "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
   } // Defs = [EFLAGS]
 
   def TEST8i8    : BinOpAI_F<0xA8, "test", Xi8 , AL,
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ba5f0f2130f..9c6c8600745 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -8018,9 +8018,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::VMOVUPSZ256mr_NOVLX:
     return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
                             get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
-  case X86::TEST8ri_NOREX:
-    MI.setDesc(get(X86::TEST8ri));
-    return true;
   case X86::MOV32ri64:
     MI.setDesc(get(X86::MOV32ri));
     return true;
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 67d95c2233d..4e11397dec4 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -86,7 +86,6 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
   case X86::TEST16mr:
   case X86::TEST32mr:
   case X86::TEST64mr:
-  case X86::TEST8ri_NOREX:
   case X86::AND16i16:
   case X86::AND16ri:
   case X86::AND16ri8:
diff --git a/test/CodeGen/X86/test-shrink.ll b/test/CodeGen/X86/test-shrink.ll
index a054c1f1edb..0cc7849e8e4 100644
--- a/test/CodeGen/X86/test-shrink.ll
+++ b/test/CodeGen/X86/test-shrink.ll
@@ -484,8 +484,7 @@ no:
 define void @truncand32(i16 inreg %x) nounwind {
 ; CHECK-LINUX64-LABEL: truncand32:
 ; CHECK-LINUX64:       # %bb.0:
-; CHECK-LINUX64-NEXT:    andl $2049, %edi # imm = 0x801
-; CHECK-LINUX64-NEXT:    testw %di, %di
+; CHECK-LINUX64-NEXT:    testl $2049, %edi # imm = 0x801
 ; CHECK-LINUX64-NEXT:    je .LBB11_1
 ; CHECK-LINUX64-NEXT:  # %bb.2: # %no
 ; CHECK-LINUX64-NEXT:    retq
@@ -498,8 +497,7 @@ define void @truncand32(i16 inreg %x) nounwind {
 ; CHECK-WIN32-64-LABEL: truncand32:
 ; CHECK-WIN32-64:       # %bb.0:
 ; CHECK-WIN32-64-NEXT:    subq $40, %rsp
-; CHECK-WIN32-64-NEXT:    andl $2049, %ecx # imm = 0x801
-; CHECK-WIN32-64-NEXT:    testw %cx, %cx
+; CHECK-WIN32-64-NEXT:    testl $2049, %ecx # imm = 0x801
 ; CHECK-WIN32-64-NEXT:    je .LBB11_1
 ; CHECK-WIN32-64-NEXT:  # %bb.2: # %no
 ; CHECK-WIN32-64-NEXT:    addq $40, %rsp
@@ -511,8 +509,7 @@ define void @truncand32(i16 inreg %x) nounwind {
 ;
 ; CHECK-X86-LABEL: truncand32:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    andl $2049, %eax # imm = 0x801
-; CHECK-X86-NEXT:    testw %ax, %ax
+; CHECK-X86-NEXT:    testl $2049, %eax # imm = 0x801
 ; CHECK-X86-NEXT:    je .LBB11_1
 ; CHECK-X86-NEXT:  # %bb.2: # %no
 ; CHECK-X86-NEXT:    retl
diff --git a/test/CodeGen/X86/testb-je-fusion.ll b/test/CodeGen/X86/testb-je-fusion.ll
index 9822ad3941d..47453ca6791 100644
--- a/test/CodeGen/X86/testb-je-fusion.ll
+++ b/test/CodeGen/X86/testb-je-fusion.ll
@@ -6,9 +6,8 @@
 define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-LABEL: check_flag:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl %edi, %ecx
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    testb $2, %ch
+; CHECK-NEXT:    testl $512, %edi # imm = 0x200
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    movl $1, %eax
diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
index 4c527a7c6c0..6ef691552aa 100644
--- a/test/CodeGen/X86/vastart-defs-eflags.ll
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -8,9 +8,7 @@ target triple = "x86_64-apple-macosx10.10.0"
 define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-LABEL: check_flag:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    subq $48, %rsp
-; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    subq $56, %rsp
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    je LBB0_2
 ; CHECK-NEXT:  ## %bb.1: ## %entry
@@ -29,7 +27,7 @@ define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    testb $2, %bh
+; CHECK-NEXT:    testl $512, %edi ## imm = 0x200
 ; CHECK-NEXT:    je LBB0_4
 ; CHECK-NEXT:  ## %bb.3: ## %if.then
 ; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
@@ -40,8 +38,7 @@ define i32 @check_flag(i32 %flags, ...) nounwind {
 ; CHECK-NEXT:    movl $8, 0
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:  LBB0_4: ## %if.end
-; CHECK-NEXT:    addq $48, %rsp
-; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    addq $56, %rsp
 ; CHECK-NEXT:    retq
 entry:
   %and = and i32 %flags, 512
-- 
2.11.0