bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) const;
+ /// Return true if the target can fuse a compare and branch.
+ /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
+ /// calculation for the instructions in a loop.
+ bool canMacroFuseCmp() const;
+
/// \brief Return true if the target supports masked load/store.
/// AVX2 and AVX-512 targets allow masks for consecutive load and store
bool isLegalMaskedStore(Type *DataType) const;
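
As a sketch of how client code consults the new hook (illustrative only; the
`TTI` reference is assumed to come from the usual TargetTransformInfo
analysis plumbing, not from this patch):

  // Decide whether a separate cmp at the loop latch should count
  // against the loop's instruction budget.
  unsigned LatchInsns = 1;     // the conditional branch itself
  if (!TTI.canMacroFuseCmp())
    ++LatchInsns;              // the cmp issues as its own instruction
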
Instruction *I) = 0;
virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) = 0;
+ virtual bool canMacroFuseCmp() = 0;
virtual bool isLegalMaskedStore(Type *DataType) = 0;
virtual bool isLegalMaskedLoad(Type *DataType) = 0;
virtual bool isLegalMaskedScatter(Type *DataType) = 0;
TargetTransformInfo::LSRCost &C2) override {
return Impl.isLSRCostLess(C1, C2);
}
+ bool canMacroFuseCmp() override {
+ return Impl.canMacroFuseCmp();
+ }
bool isLegalMaskedStore(Type *DataType) override {
return Impl.isLegalMaskedStore(DataType);
}
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
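+ /// Conservative default: assume the target cannot fuse a compare with a
+ /// branch; targets that support macro-fusion override this.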
+ bool canMacroFuseCmp() { return false; }
+
bool isLegalMaskedStore(Type *DataType) { return false; }
bool isLegalMaskedLoad(Type *DataType) { return false; }
return TTIImpl->isLSRCostLess(C1, C2);
}
+bool TargetTransformInfo::canMacroFuseCmp() const {
+ return TTIImpl->canMacroFuseCmp();
+}
+
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
return TTIImpl->isLegalMaskedStore(DataType);
}
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool X86TTIImpl::canMacroFuseCmp() {
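+  // Whether a cmp/test can fuse with a subsequent conditional branch is
+  // modeled as a subtarget feature, so defer to the subtarget.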
+ return ST->hasMacroFusion();
+}
+
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
// The backend can't handle a single element vector.
if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
Type *Ty);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
+ bool canMacroFuseCmp();
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
// If the ICmpZero formula doesn't end with zero, it cannot be replaced
// by just an add or sub. We'll need to compare the final result of the
// AddRec.
- // That means we'll need an additional instruction.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
// For -10 + {0, +, 1}:
// i = i + 1;
// cmp i, 10
//
// For {-10, +, 1}:
// i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+     !TTI.canMacroFuseCmp())
C.Insns++;
// Each new AddRec adds 1 instruction to calculation.
C.Insns += (C.AddRecCost - PrevAddRecCost);
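
To make the trade-off concrete, here is a hand-written illustration (not
compiler output) of the two induction-variable shapes the cost model chooses
between, for a hypothetical loop over n elements of an array p:

  // Count-down form: the add sets the flags itself (the ICmpZero case),
  // so no cmp is needed, but addressing needs a large offset or an
  // extra pointer increment.
  for (long i = -n; i != 0; ++i)
    p[n + i] += 1;

  // Count-up form: needs a cmp at the latch, but when cmp and branch
  // macro-fuse that cmp is effectively free, and p[i] folds into a
  // scaled addressing mode.
  for (long i = 0; i != n; ++i)
    p[i] += 1;

The updated tests below show exactly this shift: countdown loops using
addl $-1 become count-up loops using addl $1 / cmpl with simpler addressing.
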
define void @loop(i32* %p, i32 %n) nounwind {
; X86-LABEL: loop:
; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: je .LBB3_3
; X86-NEXT: # %bb.1: # %while.body.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB3_2: # %while.body
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: rdrandl %edx
-; X86-NEXT: movl %edx, (%ecx)
-; X86-NEXT: leal 4(%ecx), %ecx
-; X86-NEXT: addl $-1, %eax
+; X86-NEXT: rdrandl %esi
+; X86-NEXT: movl %esi, (%ecx,%edx,4)
+; X86-NEXT: addl $1, %edx
+; X86-NEXT: cmpl %edx, %eax
; X86-NEXT: jne .LBB3_2
; X86-NEXT: .LBB3_3: # %while.end
+; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: loop:
; X64: # %bb.0: # %entry
; X64-NEXT: testl %esi, %esi
-; X64-NEXT: je .LBB3_2
+; X64-NEXT: je .LBB3_3
+; X64-NEXT: # %bb.1: # %while.body.preheader
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: xorl %ecx, %ecx
; X64-NEXT: .p2align 4, 0x90
-; X64-NEXT: .LBB3_1: # %while.body
+; X64-NEXT: .LBB3_2: # %while.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: rdrandl %eax
-; X64-NEXT: movl %eax, (%rdi)
-; X64-NEXT: leaq 4(%rdi), %rdi
-; X64-NEXT: addl $-1, %esi
-; X64-NEXT: jne .LBB3_1
-; X64-NEXT: .LBB3_2: # %while.end
+; X64-NEXT: rdrandl %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx,4)
+; X64-NEXT: addq $1, %rcx
+; X64-NEXT: cmpl %ecx, %eax
+; X64-NEXT: jne .LBB3_2
+; X64-NEXT: .LBB3_3: # %while.end
; X64-NEXT: retq
entry:
%tobool1 = icmp eq i32 %n, 0
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: movl $-400, %eax # imm = 0xFE70
+; X32-NEXT: movl $3, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB3_1: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
-; X32-NEXT: movzbl 400(%esi,%eax), %edi
-; X32-NEXT: movzbl 400(%edx,%eax), %ebx
+; X32-NEXT: movzbl -3(%esi,%eax), %edi
+; X32-NEXT: movzbl -3(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movb %bl, 400(%ecx,%eax)
-; X32-NEXT: movzbl 401(%esi,%eax), %edi
-; X32-NEXT: movzbl 401(%edx,%eax), %ebx
+; X32-NEXT: movb %bl, -3(%ecx,%eax)
+; X32-NEXT: movzbl -2(%esi,%eax), %edi
+; X32-NEXT: movzbl -2(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movb %bl, 401(%ecx,%eax)
-; X32-NEXT: movzbl 402(%esi,%eax), %edi
-; X32-NEXT: movzbl 402(%edx,%eax), %ebx
+; X32-NEXT: movb %bl, -2(%ecx,%eax)
+; X32-NEXT: movzbl -1(%esi,%eax), %edi
+; X32-NEXT: movzbl -1(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movb %bl, 402(%ecx,%eax)
-; X32-NEXT: movzbl 403(%esi,%eax), %edi
-; X32-NEXT: movzbl 403(%edx,%eax), %ebx
+; X32-NEXT: movb %bl, -1(%ecx,%eax)
+; X32-NEXT: movzbl (%esi,%eax), %edi
+; X32-NEXT: movzbl (%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movb %bl, 403(%ecx,%eax)
+; X32-NEXT: movb %bl, (%ecx,%eax)
; X32-NEXT: addl $4, %eax
+; X32-NEXT: cmpl $403, %eax # imm = 0x193
; X32-NEXT: jne .LBB3_1
; X32-NEXT: # %bb.2: # %for.end
; X32-NEXT: popl %esi
;
; HSW-LABEL: @maxArray(
; HSW-NEXT: entry:
-; HSW-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
-; HSW-NEXT: [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
; HSW-NEXT: br label [[VECTOR_BODY:%.*]]
; HSW: vector.body:
-; HSW-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
-; HSW-NEXT: [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
-; HSW-NEXT: [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
-; HSW-NEXT: [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
-; HSW-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
-; HSW-NEXT: [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
-; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
-; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
-; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
+; HSW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; HSW-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
+; HSW-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
+; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
+; HSW-NEXT: [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
+; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
+; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
-; HSW-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
-; HSW-NEXT: [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
-; HSW-NEXT: [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
-; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
-; HSW-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
-; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; HSW-NEXT: [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
+; HSW-NEXT: [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
+; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
+; HSW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; HSW: exit:
; HSW-NEXT: ret void
;
; FUSE-LABEL: maxArray:
; FUSE: # %bb.0: # %entry
-; FUSE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
+; FUSE-NEXT: xorl %eax, %eax
; FUSE-NEXT: .p2align 4, 0x90
; FUSE-NEXT: .LBB0_1: # %vector.body
; FUSE-NEXT: # =>This Inner Loop Header: Depth=1
-; FUSE-NEXT: movupd 524288(%rdi,%rax), %xmm0
-; FUSE-NEXT: movupd 524288(%rsi,%rax), %xmm1
+; FUSE-NEXT: movupd (%rdi,%rax,8), %xmm0
+; FUSE-NEXT: movupd (%rsi,%rax,8), %xmm1
; FUSE-NEXT: maxpd %xmm0, %xmm1
-; FUSE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
-; FUSE-NEXT: addq $16, %rax
+; FUSE-NEXT: movupd %xmm1, (%rdi,%rax,8)
+; FUSE-NEXT: addq $2, %rax
+; FUSE-NEXT: cmpq $65536, %rax # imm = 0x10000
; FUSE-NEXT: jne .LBB0_1
; FUSE-NEXT: # %bb.2: # %exit
; FUSE-NEXT: retq