From 7966329ae72a795104e403af95fe0dbfc8503d31 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Fri, 1 Apr 2016 17:36:45 +0000
Subject: [PATCH] [x86] avoid intermediate splat for non-zero memsets (PR27100)

Follow-up to http://reviews.llvm.org/D18566 and
http://reviews.llvm.org/D18676 - where we noticed that an intermediate
splat was being generated for memsets of non-zero chars.

That was because we told getMemsetStores() to use a 32-bit vector element
type, and it happily obliged by producing that constant using an integer
multiply.

The 16-byte test that was added in D18566 is now equivalent for AVX1 and
AVX2 (no splats, just a vector load), but we have PR27141 to track that
splat difference.

Note that the SSE1 path is not changed in this patch. That can be a
follow-up.

This patch should resolve PR27100.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@265161 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  3 ++-
 test/CodeGen/X86/memset-nonzero.ll | 28 ++++++++++------------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 290f92e4eaa..e816e46ae24 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -2039,7 +2039,8 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
         return MVT::v32i8;
     }
     if (Subtarget.hasSSE2())
-      return MVT::v4i32;
+      return MVT::v16i8;
+    // TODO: Can SSE1 handle a byte vector?
     if (Subtarget.hasSSE1())
       return MVT::v4f32;
   } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 5d9a615e9b4..61d126e2547 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -12,15 +12,10 @@ define void @memset_16_nonzero_bytes(i8* %x) {
 ; SSE2-NEXT:    movq %rax, (%rdi)
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: memset_16_nonzero_bytes:
-; AVX1:         vmovaps {{.*#+}} xmm0 = [707406378,707406378,707406378,707406378]
-; AVX1-NEXT:    vmovups %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: memset_16_nonzero_bytes:
-; AVX2:         vbroadcastss {{.*}}(%rip), %xmm0
-; AVX2-NEXT:    vmovups %xmm0, (%rdi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: memset_16_nonzero_bytes:
+; AVX:         vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
+; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    retq
 ;
   %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
   ret void
@@ -145,19 +140,16 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: memset_16_nonconst_bytes:
-; AVX1:         movzbl %sil, %eax
-; AVX1-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1:         vmovd %esi, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: memset_16_nonconst_bytes:
-; AVX2:         movzbl %sil, %eax
-; AVX2-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %xmm0, (%rdi)
+; AVX2:         vmovd %esi, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
   tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i32 1, i1 false)
-- 
2.11.0
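
Note (illustrative sketch, not part of the patch): the `imull $16843009, ...`
in the removed check lines is the "intermediate splat" the commit message
describes. With a v4i32 element type, the memset byte must first be widened
into a 32-bit lane by multiplying with 0x01010101 (= 16843009); with v16i8,
the lane constant is the byte itself and the multiply disappears. A minimal
standalone C++ sketch of that widening step:

    #include <cstdint>
    #include <cstdio>

    // Replicate byte c into every byte of a 32-bit lane -- the same
    // widening the old v4i32 path forced the backend to emit as
    // "imull $16843009" (0x01010101). The v16i8 path skips this step.
    static uint32_t splat_byte_to_u32(uint8_t c) {
      return uint32_t(c) * 0x01010101u;
    }

    int main() {
      // For c = 42 this prints 0x2a2a2a2a, i.e. 707406378 decimal:
      // exactly the constant in the removed AVX1 vmovaps check line.
      printf("%#010x\n", (unsigned)splat_byte_to_u32(42));
      return 0;
    }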