llvm_i8_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
-
- def int_x86_sse4a_movnt_ss :
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty], []>;
- def int_x86_sse4a_movnt_sd :
- Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty], []>;
}
//===----------------------------------------------------------------------===//
Name == "x86.avx2.vinserti128" ||
Name.startswith("x86.avx.vextractf128.") ||
Name == "x86.avx2.vextracti128" ||
+ Name.startswith("x86.sse4a.movnt.") ||
Name.startswith("x86.avx.movnt.") ||
Name == "x86.sse2.storel.dq" ||
Name.startswith("x86.sse.storeu.") ||
Value *Src = CI->getArgOperand(0);
VectorType *DstTy = cast<VectorType>(CI->getType());
Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
+ } else if (Name.startswith("llvm.x86.sse4a.movnt.")) {
+ Module *M = F->getParent();
+ SmallVector<Metadata *, 1> Elts; // operands for the metadata node built below
+ Elts.push_back(
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); // single i32 constant 1
+ MDNode *Node = MDNode::get(C, Elts); // attached to the store as "nontemporal" below
+
+ Value *Arg0 = CI->getArgOperand(0); // destination pointer operand of the call
+ Value *Arg1 = CI->getArgOperand(1); // source vector operand of the call
+
+ // Nontemporal store of element 0 of the float/double source vector; the
+ // store is created with alignment 1, i.e. no alignment is assumed.
+ Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
+ PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
+ Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast"); // retype pointer to the element type
+ Value *Extract =
+ Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
+
+ StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, 1);
+ SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+
+ // Erase the original intrinsic call; the store above replaces it.
+ CI->eraseFromParent();
+ return; // return here, skipping whatever follows the if/else chain
} else if (Name.startswith("llvm.x86.avx.movnt.")) {
Module *M = F->getParent();
SmallVector<Metadata *, 1> Elts;
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
+let mayStore = 1, SchedRW = [WriteStore] in { // NOTE(review): mayStore presumably needed because the patterns below are now empty - confirm
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
- "movntss\t{$src, $dst|$dst, $src}",
- [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+ "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
- "movntsd\t{$src, $dst|$dst, $src}",
- [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+ "movntsd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
+} // let mayStore, SchedRW
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
(MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
+
+define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntss:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntss %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntss:
+; X64: # BB#0:
+; X64-NEXT: movntss %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind ; call to the movnt.ss intrinsic; the checks above expect one movntss
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
+
+define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
+; X32-LABEL: test_movntsd:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movntsd %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: test_movntsd:
+; X64: # BB#0:
+; X64-NEXT: movntsd %xmm0, (%rdi)
+; X64-NEXT: retq
+ tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind ; call to the movnt.sd intrinsic; the checks above expect one movntsd
+ ret void
+}
+
+declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a,+avx | FileCheck %s --check-prefix=X64
-define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
-; X32-LABEL: test_movntss:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movntss %xmm0, (%eax)
-; X32-NEXT: retl
-;
-; X64-LABEL: test_movntss:
-; X64: # BB#0:
-; X64-NEXT: movntss %xmm0, (%rdi)
-; X64-NEXT: retq
- tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
-
-define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
-; X32-LABEL: test_movntsd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movntsd %xmm0, (%eax)
-; X32-NEXT: retl
-;
-; X64-LABEL: test_movntsd:
-; X64: # BB#0:
-; X64-NEXT: movntsd %xmm0, (%rdi)
-; X64-NEXT: retq
- tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
- ret void
-}
-
-declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
-
define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
; X32-LABEL: test_extrqi:
; X32: # BB#0: