From 201896c9fd9d3555b7405cf6bdc6673ab2fc5d27 Mon Sep 17 00:00:00 2001
From: Matthew Simpson <mssimpso@codeaurora.org>
Date: Thu, 2 Mar 2017 15:15:35 +0000
Subject: [PATCH] [ARM/AArch64] Update costs for interleaved accesses with wide
 types

After r296750, we're able to match interleaved accesses having types wider than
128 bits. This patch updates the associated TTI costs.

Differential Revision: https://reviews.llvm.org/D29675

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296751 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64TargetTransformInfo.cpp  |  6 +-
 lib/Target/ARM/ARMTargetTransformInfo.cpp          |  6 +-
 .../LoopVectorize/AArch64/interleaved_cost.ll      | 78 +++++++++++++++-------
 .../LoopVectorize/ARM/interleaved_cost.ll          | 55 +++++++++------
 4 files changed, 97 insertions(+), 48 deletions(-)

diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5a555..97df096143f 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -509,8 +509,10 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
     unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
-    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
-      return Factor;
+    // Accesses having vector types that are a multiple of 128 bits can be
+    // matched to more than one ldN/stN instruction.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0))
+      return Factor * ((SubVecSize + 127) / 128);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 90e4f3a7cc6..ea923c53b00 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -533,9 +533,11 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
     unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
-    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128) &&
+    // Accesses having vector types that are a multiple of 128 bits can be
+    // matched to more than one vldN/vstN instruction.
+    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0) &&
         !VecTy->getScalarType()->isHalfTy())
-      return Factor;
+      return Factor * ((SubVecSize + 127) / 128);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 796f8c2bda8..5add5a24d76 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -44,15 +44,20 @@ entry:
   br label %for.body
 
 ; VF_4-LABEL: Checking a loop in "i16_factor_2"
-; VF_4:         Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "i16_factor_2"
-; VF_8:         Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; VF_2-LABEL: Checking a loop in "i32_factor_2"
-; VF_2:         Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT:    Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL: Checking a loop in "i32_factor_2"
-; VF_4:         Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT:    Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
@@ -105,11 +120,26 @@ define void @i64_factor_2(%i64.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; VF_2-LABEL: Checking a loop in "i64_factor_2"
-; VF_2:         Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT:    Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index 685c523d495..c85dc63c31a 100644
--- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -43,16 +43,21 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; VF_4-LABEL: Checking a loop in "i16_factor_2"
-; VF_4:         Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_4-NEXT:    Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-LABEL: Checking a loop in "i16_factor_2"
-; VF_8:         Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
-; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_8-NEXT:    Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_4-LABEL:  Checking a loop in "i16_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
@@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; VF_2-LABEL: Checking a loop in "i32_factor_2"
-; VF_2:         Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT:    Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_4-LABEL: Checking a loop in "i32_factor_2"
-; VF_4:         Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
-; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_4-NEXT:    Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
-- 
2.11.0