From 2f19fc85a4d8ff115cbc984c4c9b2408fdc99855 Mon Sep 17 00:00:00 2001
From: Olivier Sallenave
Date: Fri, 6 Mar 2015 23:12:04 +0000
Subject: [PATCH] Do not restrict interleaved unrolling to small loops,
 depending on the target.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231528 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h        |  7 +++
 include/llvm/Analysis/TargetTransformInfoImpl.h    |  2 +
 lib/Analysis/TargetTransformInfo.cpp               |  4 ++
 lib/Target/PowerPC/PPCTargetTransformInfo.cpp      |  4 ++
 lib/Target/PowerPC/PPCTargetTransformInfo.h        |  1 +
 lib/Transforms/Vectorize/LoopVectorize.cpp         |  8 +++
 .../LoopVectorize/PowerPC/large-loop-rdx.ll        | 73 ++++++++++++++++++++++
 7 files changed, 99 insertions(+)
 create mode 100644 test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 11a5b6af097..aeab0e18982 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -331,6 +331,9 @@ public:
   /// target.
   bool shouldBuildLookupTables() const;
 
+  /// \brief Don't restrict interleaved unrolling to small loops.
+  bool enableAggressiveInterleaving(bool LoopHasReductions) const;
+
   /// \brief Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
@@ -531,6 +534,7 @@ public:
   virtual unsigned getJumpBufAlignment() = 0;
   virtual unsigned getJumpBufSize() = 0;
   virtual bool shouldBuildLookupTables() = 0;
+  virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
   virtual unsigned getFPOpCost(Type *Ty) = 0;
@@ -648,6 +652,9 @@ public:
   bool shouldBuildLookupTables() override {
     return Impl.shouldBuildLookupTables();
   }
+  bool enableAggressiveInterleaving(bool LoopHasReductions) override {
+    return Impl.enableAggressiveInterleaving(LoopHasReductions);
+  }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3e02c0ce3ca..9d2a7b5366d 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -235,6 +235,8 @@ public:
 
   bool shouldBuildLookupTables() { return true; }
 
+  bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
     return TTI::PSK_Software;
   }
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4d336363c5f..1b52d4a5502 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -143,6 +143,10 @@ bool TargetTransformInfo::shouldBuildLookupTables() const {
   return TTIImpl->shouldBuildLookupTables();
 }
 
+bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
+  return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 073bbb0c556..b46acd47f31 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -192,6 +192,10 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L,
   BaseT::getUnrollingPreferences(L, UP);
 }
 
+bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
+  return LoopHasReductions;
+}
+
 unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
   if (Vector && !ST->hasAltivec() && !ST->hasQPX())
     return 0;
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index cef70794234..21acea1a36d 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -78,6 +78,7 @@ public:
   /// \name Vector TTI Implementations
   /// @{
 
+  bool enableAggressiveInterleaving(bool LoopHasReductions);
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor();
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 686b8995a31..ffa3fe13df5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4564,6 +4564,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
     return SmallUF;
   }
 
+  // Unroll if this is a large loop (small loops are already dealt with by this
+  // point) that could benefit from interleaved unrolling.
+  bool HasReductions = (Legal->getReductionVars()->size() > 0);
+  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+    DEBUG(dbgs() << "LV: Unrolling to expose ILP.\n");
+    return UF;
+  }
+
   DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
diff --git a/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
new file mode 100644
index 00000000000..de6595f64ed
--- /dev/null
+++ b/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT-NOT: fadd
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+  %cmp24 = icmp sgt i32 %n, 0
+  br i1 %cmp24, label %for.cond1.preheader.preheader, label %for.end13
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+  %sum.026 = phi double [ %add10.2, %for.cond1.preheader ], [ 0.000000e+00, %for.cond1.preheader.preheader ]
+  %arrayidx5.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+  %arrayidx5.real = load float, float* %arrayidx5.realp, align 8
+  %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+  %arrayidx5.imag = load float, float* %arrayidx5.imagp, align 8
+  %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+  %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+  %add = fadd fast float %mul9, %mul
+  %conv = fpext float %add to double
+  %add10 = fadd fast double %conv, %sum.026
+  %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+  %arrayidx5.real.1 = load float, float* %arrayidx5.realp.1, align 8
+  %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+  %arrayidx5.imag.1 = load float, float* %arrayidx5.imagp.1, align 8
+  %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+  %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+  %add.1 = fadd fast float %mul9.1, %mul.1
+  %conv.1 = fpext float %add.1 to double
+  %add10.1 = fadd fast double %conv.1, %add10
+  %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+  %arrayidx5.real.2 = load float, float* %arrayidx5.realp.2, align 8
+  %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+  %arrayidx5.imag.2 = load float, float* %arrayidx5.imagp.2, align 8
+  %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+  %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+  %add.2 = fadd fast float %mul9.2, %mul.2
+  %conv.2 = fpext float %add.2 to double
+  %add10.2 = fadd fast double %conv.2, %add10.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.for.end13_crit_edge, label %for.cond1.preheader
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.cond1.preheader
+  %add10.2.lcssa = phi double [ %add10.2, %for.cond1.preheader ]
+  %phitmp = fptrunc double %add10.2.lcssa to float
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry
+  %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+  store float %sum.0.lcssa, float* %r, align 4
+  ret void
+}
+
-- 
2.11.0
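
As an aside, the reduction loop exercised by large-loop-rdx.ll corresponds roughly to the
C++ sketch below. It is reconstructed from the test IR only; the function and parameter
names are invented for illustration and are not taken from the original source. The loop
is too large for the existing small-loop unrolling heuristic, yet its single double
accumulator forms a serial fadd chain, which is exactly the case the new hook targets.

// Illustrative sketch only, reconstructed from the IR in large-loop-rdx.ll;
// "norm2_v" and its parameters are hypothetical names.
// Each complex element is modeled as float[2] (real, imag), matching the
// { float, float } struct in the IR.
void norm2_v(float *r, const float (*a)[3][2], int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i) {
    for (int c = 0; c < 3; ++c) {          // inner loop fully unrolled in the IR
      float re = a[i][c][0];               // real part
      float im = a[i][c][1];               // imaginary part
      sum += (double)(re * re + im * im);  // loop-carried dependence on sum
    }
  }
  *r = (float)sum;                         // result truncated back to float
}

With PPCTTIImpl::enableAggressiveInterleaving returning true for loops with reductions,
selectUnrollFactor now keeps the computed interleave factor for a loop like this instead
of giving up on unrolling, splitting the accumulator into independent partial sums; the
run of consecutive fadd CHECK lines in the test looks for those interleaved partial sums.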