From 201896c9fd9d3555b7405cf6bdc6673ab2fc5d27 Mon Sep 17 00:00:00 2001 From: Matthew Simpson Date: Thu, 2 Mar 2017 15:15:35 +0000 Subject: [PATCH] [ARM/AArch64] Update costs for interleaved accesses with wide types After r296750, we're able to match interleaved accesses having types wider than 128 bits. This patch updates the associated TTI costs. Differential Revision: https://reviews.llvm.org/D29675 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296751 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 6 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 6 +- .../LoopVectorize/AArch64/interleaved_cost.ll | 78 +++++++++++++++------- .../LoopVectorize/ARM/interleaved_cost.ll | 55 +++++++++------ 4 files changed, 97 insertions(+), 48 deletions(-) diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index b8833e5a555..97df096143f 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -509,8 +509,10 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) - return Factor; + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one ldN/stN instruction. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0)) + return Factor * ((SubVecSize + 127) / 128); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 90e4f3a7cc6..ea923c53b00 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -533,9 +533,11 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. - if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128) && + // Accesses having vector types that are a multiple of 128 bits can be + // matched to more than one vldN/vstN instruction. + if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize % 128 == 0) && !VecTy->getScalarType()->isHalfTy()) - return Factor; + return Factor * ((SubVecSize + 127) / 128); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 796f8c2bda8..5add5a24d76 100644 --- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -44,15 +44,20 @@ entry: br label %for.body ; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 @@ -105,11 +120,26 @@ define void @i64_factor_2(%i64.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i64_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_2-LABEL: Checking a loop in "i64_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_4-LABEL: Checking a loop in "i64_factor_2" +; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_8-LABEL: Checking a loop in "i64_factor_2" +; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8 +; VF_16-LABEL: Checking a loop in "i64_factor_2" +; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0 diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index 685c523d495..c85dc63c31a 100644 --- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -43,16 +43,21 @@ define void @i16_factor_2(%i16.2* %data, i64 %n) { entry: br label %for.body -; VF_4-LABEL: Checking a loop in "i16_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 -; VF_8-LABEL: Checking a loop in "i16_factor_2" -; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_4-LABEL: Checking a loop in "i16_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_8-LABEL: Checking a loop in "i16_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2 +; VF_16-LABEL: Checking a loop in "i16_factor_2" +; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0 @@ -74,16 +79,26 @@ define void @i32_factor_2(%i32.2* %data, i64 %n) { entry: br label %for.body -; VF_2-LABEL: Checking a loop in "i32_factor_2" -; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 -; VF_4-LABEL: Checking a loop in "i32_factor_2" -; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_2-LABEL: Checking a loop in "i32_factor_2" +; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_4-LABEL: Checking a loop in "i32_factor_2" +; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_8-LABEL: Checking a loop in "i32_factor_2" +; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4 +; VF_16-LABEL: Checking a loop in "i32_factor_2" +; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0 -- 2.11.0