From af7c00f21360208a4f87a1bcd01f5f268037e1b7 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 25 Aug 2015 16:29:21 +0000 Subject: [PATCH] make fast unaligned memory accesses implicit with SSE4.2 or SSE4a This is a follow-on from the discussion in http://reviews.llvm.org/D12154. This change allows memset/memcpy to use SSE or AVX memory accesses for any chip that has generally fast unaligned memory ops. A motivating use case for this change is a clang invocation that doesn't explicitly set the CPU, but does target a feature that we know only exists on a CPU that supports fast unaligned memops. For example: $ clang -O1 foo.c -mavx This resolves a difference in lowering noted in PR24449: https://llvm.org/bugs/show_bug.cgi?id=24449 Before this patch, we used different store types depending on whether the example can be lowered as a memset or not. Differential Revision: http://reviews.llvm.org/D12288 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245950 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86Subtarget.cpp | 7 +++++++ test/CodeGen/X86/slow-unaligned-mem.ll | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 565ba1ded7e..b23b3c0e99a 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -192,6 +192,13 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. + if (hasSSE42() || hasSSE4A()) + IsUAMemUnder32Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll index f8688e3435c..27cbef681b7 100644 --- a/test/CodeGen/X86/slow-unaligned-mem.ll +++ b/test/CodeGen/X86/slow-unaligned-mem.ll @@ -55,6 +55,11 @@ ; Slow chips use 4-byte stores. Fast chips with SSE or later use something other than 4-byte stores. ; Chips that don't have SSE use 4-byte stores either way, so they're not tested. +; Also verify that SSE4.2 or SSE4a imply fast unaligned accesses. + +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2 2>&1 | FileCheck %s --check-prefix=FAST +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4a 2>&1 | FileCheck %s --check-prefix=FAST + define void @store_zeros(i8* %a) { ; SLOW-NOT: not a recognized processor ; SLOW-LABEL: store_zeros: -- 2.11.0