From e2e7d46a44d5e7116a3b2ec5fae992b311e7dc42 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sat, 18 Jun 2016 02:38:26 +0000
Subject: [PATCH] [X86][SSE4A] Autoupgrade and remove MOVNTSD/MOVNTSS intrinsics

Required better annotation of the instruction defs upon removal of the builtin intrinsic pattern.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273077 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td  |  5 -----
 lib/IR/AutoUpgrade.cpp            | 25 +++++++++++++++++++++++++
 lib/Target/X86/X86InstrSSE.td     |  8 ++++----
 test/CodeGen/X86/sse4a-upgrade.ll | 39 +++++++++++++++++++++++++++++++++++++++
 test/CodeGen/X86/sse4a.ll         | 34 ----------------------------------
 5 files changed, 68 insertions(+), 43 deletions(-)
 create mode 100644 test/CodeGen/X86/sse4a-upgrade.ll

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index fc65efd4f07..547e575c411 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -941,11 +941,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
                                 llvm_i8_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">,
     Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-
-  def int_x86_sse4a_movnt_ss :
-    Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty], []>;
-  def int_x86_sse4a_movnt_sd :
-    Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty], []>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index d3ee5e8e5db..733eb85cb5b 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -209,6 +209,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx2.vinserti128" ||
         Name.startswith("x86.avx.vextractf128.") ||
         Name == "x86.avx2.vextracti128" ||
+        Name.startswith("x86.sse4a.movnt.") ||
         Name.startswith("x86.avx.movnt.") ||
         Name == "x86.sse2.storel.dq" ||
         Name.startswith("x86.sse.storeu.") ||
@@ -616,6 +617,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Value *Src = CI->getArgOperand(0);
       VectorType *DstTy = cast<VectorType>(CI->getType());
       Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
+    } else if (Name.startswith("llvm.x86.sse4a.movnt.")) {
+      Module *M = F->getParent();
+      SmallVector<Metadata *, 1> Elts;
+      Elts.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+
+      // Nontemporal (unaligned) store of the 0'th element of the float/double
+      // vector.
+      Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
+      PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
+      Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast");
+      Value *Extract =
+          Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
+
+      StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, 1);
+      SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
+      return;
     } else if (Name.startswith("llvm.x86.avx.movnt.")) {
       Module *M = F->getParent();
       SmallVector<Metadata *, 1> Elts;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f589342437d..7d20bda1bf8 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7776,13 +7776,13 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
 
 // Non-temporal (unaligned) scalar stores.
 let AddedComplexity = 400 in { // Prefer non-temporal versions
+let mayStore = 1, SchedRW = [WriteStore] in {
 def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
-                "movntss\t{$src, $dst|$dst, $src}",
-                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+                "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
 
 def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
-                "movntsd\t{$src, $dst|$dst, $src}",
-                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+                "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+} // SchedRW
 
 def : Pat<(nontemporalstore FR32:$src, addr:$dst),
           (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
diff --git a/test/CodeGen/X86/sse4a-upgrade.ll b/test/CodeGen/X86/sse4a-upgrade.ll
new file mode 100644
index 00000000000..a129c658f4b
--- /dev/null
+++ b/test/CodeGen/X86/sse4a-upgrade.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
+
+define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntss:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movntss %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_movntss:
+; X64:       # BB#0:
+; X64-NEXT:    movntss %xmm0, (%rdi)
+; X64-NEXT:    retq
+  tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
+  ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntsd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movntsd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_movntsd:
+; X64:       # BB#0:
+; X64-NEXT:    movntsd %xmm0, (%rdi)
+; X64-NEXT:    retq
+  tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
+  ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
index 53d1b3b5429..1f582fb4ed9 100644
--- a/test/CodeGen/X86/sse4a.ll
+++ b/test/CodeGen/X86/sse4a.ll
@@ -4,40 +4,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
 
-define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
-; X32-LABEL: test_movntss:
-; X32:       # BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movntss %xmm0, (%eax)
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_movntss:
-; X64:       # BB#0:
-; X64-NEXT:    movntss %xmm0, (%rdi)
-; X64-NEXT:    retq
-  tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
-  ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
-
-define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
-; X32-LABEL: test_movntsd:
-; X32:       # BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movntsd %xmm0, (%eax)
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_movntsd:
-; X64:       # BB#0:
-; X64-NEXT:    movntsd %xmm0, (%rdi)
-; X64-NEXT:    retq
-  tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
-  ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
-
 define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
 ; X32-LABEL: test_extrqi:
 ; X32:       # BB#0:
-- 
2.11.0