From: Simon Pilgrim Date: Fri, 27 May 2016 09:02:25 +0000 (+0000) Subject: Revert: r270973 - [X86][SSE] Replace (V)PMOVSX and (V)PMOVZX integer extension intrin... X-Git-Tag: android-x86-7.1-r4~32715 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=38080264564204007aed70899a31ba29986b2d4e;p=android-x86%2Fexternal-llvm.git Revert: r270973 - [X86][SSE] Replace (V)PMOVSX and (V)PMOVZX integer extension intrinsics with generic IR (llvm) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@270976 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 99cfd6c276e..6ebf02d8997 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -756,6 +756,28 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_i32_ty], [IntrNoMem]>; } +// Vector sign and zero extend +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_sse41_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd128">, + Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_sse41_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_sse41_pmovzxbw : GCCBuiltin<"__builtin_ia32_pmovzxbw128">, + Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_sse41_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_sse41_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd128">, + Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], + [IntrNoMem]>; + def int_x86_sse41_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], + [IntrNoMem]>; +} + // Vector min element let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">, @@ -2844,6 +2866,46 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; } +// Vector sign and zero extend +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_avx2_pmovsxbd : GCCBuiltin<"__builtin_ia32_pmovsxbd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovsxbq : GCCBuiltin<"__builtin_ia32_pmovsxbq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovsxbw : GCCBuiltin<"__builtin_ia32_pmovsxbw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovsxdq : GCCBuiltin<"__builtin_ia32_pmovsxdq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovsxwd : GCCBuiltin<"__builtin_ia32_pmovsxwd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i16_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovsxwq : GCCBuiltin<"__builtin_ia32_pmovsxwq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v8i16_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxbw : GCCBuiltin<"__builtin_ia32_pmovzxbw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v16i8_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v4i32_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i16_ty], + [IntrNoMem]>; + def int_x86_avx2_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v8i16_ty], + [IntrNoMem]>; +} + // Vector blend let TargetPrefix = "x86" in { // All intrinsics 
start with "llvm.x86.". def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">, diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index fd2873ed01f..d05bb47a68c 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -178,9 +178,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || Name.startswith("x86.sse41.pmovsx") || - Name.startswith("x86.sse41.pmovzx") || - Name.startswith("x86.avx2.pmovsx") || - Name.startswith("x86.avx2.pmovzx") || Name == "x86.sse2.cvtdq2pd" || Name == "x86.sse2.cvtps2pd" || Name == "x86.avx.cvtdq2.pd.256" || @@ -547,25 +544,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { for (unsigned I = 0; I < EltNum; ++I) Rep = Builder.CreateInsertElement(Rep, Load, ConstantInt::get(I32Ty, I)); - } else if (Name.startswith("llvm.x86.sse41.pmovsx") || - Name.startswith("llvm.x86.sse41.pmovzx") || - Name.startswith("llvm.x86.avx2.pmovsx") || - Name.startswith("llvm.x86.avx2.pmovzx")) { + } else if (Name.startswith("llvm.x86.sse41.pmovsx")) { VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType()); VectorType *DstTy = cast<VectorType>(CI->getType()); unsigned NumDstElts = DstTy->getNumElements(); - // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + // Extract a subvector of the first NumDstElts lanes and sign extend. SmallVector<int, 8> ShuffleMask; for (int i = 0; i != (int)NumDstElts; ++i) ShuffleMask.push_back(i); Value *SV = Builder.CreateShuffleVector( CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask); - - bool DoSext = (StringRef::npos != Name.find("pmovsx")); - Rep = DoSext ? Builder.CreateSExt(SV, DstTy) - : Builder.CreateZExt(SV, DstTy); + Rep = Builder.CreateSExt(SV, DstTy); } else if (Name == "llvm.x86.avx2.vbroadcasti128") { // Replace vbroadcasts with a vector shuffle. 
Type *VT = VectorType::get(Type::getInt64Ty(C), 2); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 6d8f254b5c4..93d1e94d1ca 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -361,6 +361,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), @@ -2276,6 +2288,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c87d0466a9d..c07ca85d3a4 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -325,6 +325,24 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, return Builder.CreateAShr(Vec, ShiftVec); } +static Value *simplifyX86extend(const IntrinsicInst &II, + InstCombiner::BuilderTy &Builder, + bool SignExtend) { + VectorType *SrcTy = cast<VectorType>(II.getArgOperand(0)->getType()); + VectorType *DstTy = cast<VectorType>(II.getType()); + unsigned NumDstElts = DstTy->getNumElements(); + + // Extract a subvector of the first NumDstElts lanes and sign/zero extend. + SmallVector<int, 8> ShuffleMask; + for (int i = 0; i != (int)NumDstElts; ++i) + ShuffleMask.push_back(i); + + Value *SV = Builder.CreateShuffleVector(II.getArgOperand(0), + UndefValue::get(SrcTy), ShuffleMask); + return SignExtend ? 
Builder.CreateSExt(SV, DstTy) + : Builder.CreateZExt(SV, DstTy); +} + static Value *simplifyX86insertps(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); @@ -1631,6 +1649,32 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::x86_avx2_pmovsxbd: + case Intrinsic::x86_avx2_pmovsxbq: + case Intrinsic::x86_avx2_pmovsxbw: + case Intrinsic::x86_avx2_pmovsxdq: + case Intrinsic::x86_avx2_pmovsxwd: + case Intrinsic::x86_avx2_pmovsxwq: + if (Value *V = simplifyX86extend(*II, *Builder, true)) + return replaceInstUsesWith(*II, V); + break; + + case Intrinsic::x86_sse41_pmovzxbd: + case Intrinsic::x86_sse41_pmovzxbq: + case Intrinsic::x86_sse41_pmovzxbw: + case Intrinsic::x86_sse41_pmovzxdq: + case Intrinsic::x86_sse41_pmovzxwd: + case Intrinsic::x86_sse41_pmovzxwq: + case Intrinsic::x86_avx2_pmovzxbd: + case Intrinsic::x86_avx2_pmovzxbq: + case Intrinsic::x86_avx2_pmovzxbw: + case Intrinsic::x86_avx2_pmovzxdq: + case Intrinsic::x86_avx2_pmovzxwd: + case Intrinsic::x86_avx2_pmovzxwq: + if (Value *V = simplifyX86extend(*II, *Builder, false)) + return replaceInstUsesWith(*II, V); + break; + case Intrinsic::x86_sse41_insertps: if (Value *V = simplifyX86insertps(*II, *Builder)) return replaceInstUsesWith(*II, V); diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index 8d03784ce1b..862e9378afe 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -247,72 +247,6 @@ define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone -define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbd: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: retl - %res = call 
<4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbw: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: retl - %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxdq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone - - -define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxwd: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) { -; CHECK-LABEL: 
test_x86_sse41_pmovzxwq: -; CHECK: ## BB#0: -; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone - - define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) { ; AVX-LABEL: test_x86_sse2_cvtdq2pd: ; AVX: ## BB#0: diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 84f8f3cd150..ac8be0fa2e2 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl,aes,pclmul | FileCheck %s --check-prefix=AVX512VL @@ -1800,6 +1800,102 @@ define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) { declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone +define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { +; AVX-LABEL: test_x86_sse41_pmovzxbd: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxbd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) { +; AVX-LABEL: 
test_x86_sse41_pmovzxbq: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxbq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) { +; AVX-LABEL: test_x86_sse41_pmovzxbw: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxbw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VL-NEXT: retl + %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) { +; AVX-LABEL: test_x86_sse41_pmovzxdq: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxdq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone + + +define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) { +; AVX-LABEL: test_x86_sse41_pmovzxwd: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxwd {{.*#+}} 
xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxwd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) { +; AVX-LABEL: test_x86_sse41_pmovzxwq: +; AVX: ## BB#0: +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-NEXT: retl +; +; AVX512VL-LABEL: test_x86_sse41_pmovzxwq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone + + define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ; AVX-LABEL: test_x86_sse41_pmuldq: ; AVX: ## BB#0: @@ -4030,7 +4126,7 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; AVX512VL-LABEL: test_x86_avx_storeu_dq_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512VL-NEXT: vpaddb LCPI225_0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb LCPI231_0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqu %ymm0, (%eax) ; AVX512VL-NEXT: retl %a2 = add <32 x i8> %a1, @@ -4271,7 +4367,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) { ; ; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2: ; AVX512VL: ## BB#0: -; AVX512VL-NEXT: vpermilpd LCPI239_0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpermilpd LCPI245_0, %ymm0, %ymm0 ; AVX512VL-NEXT: retl %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> ) ; <<4 x double>> [#uses=1] ret <4 x double> %res @@ -4763,7 
+4859,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX-LABEL: movnt_dq: ; AVX: ## BB#0: ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0 +; AVX-NEXT: vpaddq LCPI272_0, %xmm0, %xmm0 ; AVX-NEXT: vmovntdq %ymm0, (%eax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retl @@ -4771,7 +4867,7 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind { ; AVX512VL-LABEL: movnt_dq: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512VL-NEXT: vpaddq LCPI266_0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq LCPI272_0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ; AVX512VL-NEXT: retl %a2 = add <2 x i64> %a1, diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll index aad7e8b5fbf..684412e38e8 100644 --- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -740,10 +740,11 @@ define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) { ; X64-NEXT: vpmovsxbw %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %ext = sext <16 x i8> %arg0 to <16 x i16> - %res = bitcast <16 x i16> %ext to <4 x i64> + %call = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %arg0) + %res = bitcast <16 x i16> %call to <4 x i64> ret <4 x i64> %res } +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepi8_epi32: @@ -756,11 +757,11 @@ define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) { ; X64-NEXT: vpmovsxbd %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> - %ext = sext <8 x i8> %shuf to <8 x i32> - %res = bitcast <8 x i32> %ext to <4 x i64> + %call = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %arg0) + %res = bitcast <8 x i32> %call to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> 
@llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepi8_epi64: @@ -773,10 +774,10 @@ define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovsxbq %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> - %ext = sext <4 x i8> %shuf to <4 x i64> - ret <4 x i64> %ext + %call = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %arg0) + ret <4 x i64> %call } +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepi16_epi32: @@ -789,10 +790,11 @@ define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) { ; X64-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %ext = sext <8 x i16> %arg0 to <8 x i32> - %res = bitcast <8 x i32> %ext to <4 x i64> + %call = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %arg0) + %res = bitcast <8 x i32> %call to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepi16_epi64: @@ -805,10 +807,10 @@ define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovsxwq %xmm0, %ymm0 ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> - %ext = sext <4 x i16> %shuf to <4 x i64> - ret <4 x i64> %ext + %call = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %arg0) + ret <4 x i64> %call } +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepi32_epi64: @@ -821,9 +823,10 @@ define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovsxdq %xmm0, %ymm0 ; X64-NEXT: 
retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %ext = sext <4 x i32> %arg0 to <4 x i64> - ret <4 x i64> %ext + %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %arg0) + ret <4 x i64> %res } +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu8_epi16: @@ -836,10 +839,11 @@ define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) { ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %ext = zext <16 x i8> %arg0 to <16 x i16> - %res = bitcast <16 x i16> %ext to <4 x i64> + %call = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %arg0) + %res = bitcast <16 x i16> %call to <4 x i64> ret <4 x i64> %res } +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu8_epi32: @@ -852,11 +856,11 @@ define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) { ; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> - %ext = zext <8 x i8> %shuf to <8 x i32> - %res = bitcast <8 x i32> %ext to <4 x i64> + %call = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %arg0) + %res = bitcast <8 x i32> %call to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu8_epi64: @@ -869,10 
+873,10 @@ define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> - %ext = zext <4 x i8> %shuf to <4 x i64> - ret <4 x i64> %ext + %call = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %arg0) + ret <4 x i64> %call } +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu16_epi32: @@ -885,10 +889,11 @@ define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) { ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %ext = zext <8 x i16> %arg0 to <8 x i32> - %res = bitcast <8 x i32> %ext to <4 x i64> + %call = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %arg0) + %res = bitcast <8 x i32> %call to <4 x i64> ret <4 x i64> %res } +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu16_epi64: @@ -901,10 +906,10 @@ define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> - %ext = zext <4 x i16> %shuf to <4 x i64> - ret <4 x i64> %ext + %call = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %arg0) + ret <4 x i64> %call } +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone define <4 x i64> 
@test_mm256_cvtepu32_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm256_cvtepu32_epi64: @@ -917,9 +922,10 @@ define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) { ; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %ext = zext <4 x i32> %arg0 to <4 x i64> - ret <4 x i64> %ext + %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %arg0) + ret <4 x i64> %res } +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind { ; X32-LABEL: test_mm256_extracti128_si256: diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll index 95f18610585..36b6da5ef96 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -203,99 +203,3 @@ define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly - - -define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) { -; CHECK: vpmovsxbd - %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) { -; CHECK: vpmovsxbq - %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) { -; CHECK: vpmovsxbw - %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) { -; CHECK: vpmovsxdq - %res = call <4 
x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone - - -define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) { -; CHECK: vpmovsxwd - %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) { -; CHECK: vpmovsxwq - %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone - - -define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) { -; CHECK: vpmovzxbd - %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) { -; CHECK: vpmovzxbq - %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone - - -define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) { -; CHECK: vpmovzxbw - %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) { -; CHECK: vpmovzxdq - %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone - - -define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) { -; CHECK: vpmovzxwd - %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1] - ret <8 x i32> %res -} -declare <8 x i32> 
@llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone - - -define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) { -; CHECK: vpmovzxwq - %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1] - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index b5c4dbcb777..8e96df2cab5 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=AVX2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL @@ -1077,6 +1078,198 @@ define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) { declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone +define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxbd: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxbd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxbd %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxbq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxbq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxbq %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 
x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxbw: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxbw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxdq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxdq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxwd: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxwd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovsxwq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovsxwq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> 
@llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxbd: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxbd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxbq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxbq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxbw: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxbw: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-NEXT: retl + %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxdq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxdq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxwd: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxwd: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VL-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res 
+} +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone + + +define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) { +; AVX2-LABEL: test_x86_avx2_pmovzxwq: +; AVX2: ## BB#0: +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: retl +; +; AVX512VL-LABEL: test_x86_avx2_pmovzxwq: +; AVX512VL: ## BB#0: +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512VL-NEXT: retl + %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone + + define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) { %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<2 x i64>> [#uses=1] ret <4 x i64> %res @@ -1481,7 +1674,7 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; AVX2-LABEL: test_x86_avx_storeu_dq_256: ; AVX2: ## BB#0: ; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX2-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb LCPI103_0, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%eax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retl @@ -1489,7 +1682,7 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; AVX512VL-LABEL: test_x86_avx_storeu_dq_256: ; AVX512VL: ## BB#0: ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512VL-NEXT: vpaddb LCPI91_0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb LCPI103_0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqu %ymm0, (%eax) ; AVX512VL-NEXT: retl %a2 = add <32 x i8> %a1, diff --git a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll index f281bbaa675..6bd6a5041d4 100644 --- a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll +++ b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll @@ -1,10 +1,10 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin 
-mcpu=x86-64 -mattr=+avx2 | FileCheck %s -define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) { -; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbw +define <16 x i16> @test_lvm_x86_avx2_pmovsxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_lvm_x86_avx2_pmovsxbw ; CHECK: vpmovsxbw (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = sext <16 x i8> %1 to <16 x i16> + %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1) ret <16 x i16> %2 } @@ -12,25 +12,23 @@ define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd ; CHECK: vpmovsxbd (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> - %3 = sext <8 x i8> %2 to <8 x i32> - ret <8 x i32> %3 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1) + ret <8 x i32> %2 } define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq ; CHECK: vpmovsxbq (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> - %3 = sext <4 x i8> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1) + ret <4 x i64> %2 } define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd ; CHECK: vpmovsxwd (%rdi), %ymm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = sext <8 x i16> %1 to <8 x i32> + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1) ret <8 x i32> %2 } @@ -38,24 +36,23 @@ define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq ; CHECK: vpmovsxwq (%rdi), %ymm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> - %3 = sext <4 x i16> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1) + ret <4 x i64> %2 } define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* 
%a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq ; CHECK: vpmovsxdq (%rdi), %ymm0 %1 = load <4 x i32>, <4 x i32>* %a, align 1 - %2 = sext <4 x i32> %1 to <4 x i64> + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1) ret <4 x i64> %2 } -define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) { -; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbw +define <16 x i16> @test_lvm_x86_avx2_pmovzxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_lvm_x86_avx2_pmovzxbw ; CHECK: vpmovzxbw (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = zext <16 x i8> %1 to <16 x i16> + %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1) ret <16 x i16> %2 } @@ -63,25 +60,23 @@ define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbd ; CHECK: vpmovzxbd (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> - %3 = zext <8 x i8> %2 to <8 x i32> - ret <8 x i32> %3 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1) + ret <8 x i32> %2 } define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq ; CHECK: vpmovzxbq (%rdi), %ymm0 %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> - %3 = zext <4 x i8> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1) + ret <4 x i64> %2 } define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd ; CHECK: vpmovzxwd (%rdi), %ymm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = zext <8 x i16> %1 to <8 x i32> + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1) ret <8 x i32> %2 } @@ -89,15 +84,27 @@ define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq ; CHECK: vpmovzxwq (%rdi), %ymm0 %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x 
i32> - %3 = zext <4 x i16> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1) + ret <4 x i64> %2 } define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) { ; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq ; CHECK: vpmovzxdq (%rdi), %ymm0 %1 = load <4 x i32>, <4 x i32>* %a, align 1 - %2 = zext <4 x i32> %1 to <4 x i64> + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1) ret <4 x i64> %2 } + +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll index 7bfce0941a2..03137ceaac0 100644 --- a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -301,11 +301,11 @@ define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) { ; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> - %sext = zext <8 x i8> %ext0 to <8 x i16> - %res = bitcast <8 x i16> %sext to <2 x i64> + %zext = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %arg0) + %res = bitcast <8 x i16> %zext to <2 x i64> ret <2 x i64> %res } +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone define 
<2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm_cvtepu8_epi32: @@ -318,11 +318,11 @@ define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) { ; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> - %sext = zext <4 x i8> %ext0 to <4 x i32> - %res = bitcast <4 x i32> %sext to <2 x i64> + %zext = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %arg0) + %res = bitcast <4 x i32> %zext to <2 x i64> ret <2 x i64> %res } +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm_cvtepu8_epi64: @@ -335,10 +335,10 @@ define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) { ; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <16 x i8> - %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> - %sext = zext <2 x i8> %ext0 to <2 x i64> - ret <2 x i64> %sext + %zext = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %arg0) + ret <2 x i64> %zext } +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) { ; X32-LABEL: test_mm_cvtepu16_epi32: @@ -351,11 +351,11 @@ define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) { ; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> - %sext = zext <4 x i16> %ext0 to <4 x i32> - %res = bitcast <4 x i32> %sext to <2 x i64> + %zext = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %arg0) + %res = bitcast <4 x i32> %zext to <2 x i64> ret <2 x i64> %res } +declare <4 x 
i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm_cvtepu16_epi64: @@ -368,10 +368,10 @@ define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) { ; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> - %sext = zext <2 x i16> %ext0 to <2 x i64> - ret <2 x i64> %sext + %zext = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %arg0) + ret <2 x i64> %zext } +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) { ; X32-LABEL: test_mm_cvtepu32_epi64: @@ -384,10 +384,10 @@ define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) { ; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: retq %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> - %sext = zext <2 x i32> %ext0 to <2 x i64> - ret <2 x i64> %sext + %zext = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %arg0) + ret <2 x i64> %zext } +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) { ; X32-LABEL: test_mm_dp_pd: diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll index 72bf4395bb9..2c3c02baf97 100644 --- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -145,69 +145,3 @@ define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) { ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone - - -define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbd: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbq: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxbw: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-NEXT: retl - %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxdq: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone - - -define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxwd: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: retl - %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %res -} -declare <4 x i32> 
@llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone - - -define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) { -; CHECK-LABEL: test_x86_sse41_pmovzxwq: -; CHECK: ## BB#0: -; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; CHECK-NEXT: retl - %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll index b8d058cc12e..6b4ea6b7c20 100644 --- a/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -284,6 +284,102 @@ define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) { declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone +define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxbd: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxbd: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxbq: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxbq: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; 
KNL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxbw: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxbw: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: retl + %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxdq: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxdq: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; KNL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone + + +define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxwd: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxwd: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; KNL-NEXT: retl + %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x 
i16>) nounwind readnone + + +define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) { +; SSE41-LABEL: test_x86_sse41_pmovzxwq: +; SSE41: ## BB#0: +; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; SSE41-NEXT: retl +; +; KNL-LABEL: test_x86_sse41_pmovzxwq: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; KNL-NEXT: retl + %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone + + define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) { ; SSE41-LABEL: test_x86_sse41_pmuldq: ; SSE41: ## BB#0: diff --git a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll index 756beb995c0..a7e48d8ac03 100644 --- a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll +++ b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll @@ -109,9 +109,8 @@ define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) { ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <8 x i32> - %3 = zext <8 x i8> %2 to <8 x i16> - ret <8 x i16> %3 + %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1) + ret <8 x i16> %2 } define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) { @@ -125,9 +124,8 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) { ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <4 x i32> - %3 = zext <4 x i8> %2 to <4 x i32> - ret <4 x i32> %3 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1) + ret <4 x i32> %2 } define 
<2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) { @@ -141,9 +139,8 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) { ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %a, align 1 - %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <2 x i32> - %3 = zext <2 x i8> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1) + ret <2 x i64> %2 } define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) { @@ -157,9 +154,8 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) { ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> - %3 = zext <4 x i16> %2 to <4 x i32> - ret <4 x i32> %3 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1) + ret <4 x i32> %2 } define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) { @@ -173,9 +169,8 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) { ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; AVX-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a, align 1 - %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <2 x i32> - %3 = zext <2 x i16> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1) + ret <2 x i64> %2 } define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) { @@ -189,7 +184,13 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) { ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero ; AVX-NEXT: retq %1 = load <4 x i32>, <4 x i32>* %a, align 1 - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> - %3 = zext <2 x i32> %2 to <2 x i64> - ret <2 x i64> %3 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1) + ret <2 x i64> %2 } + +declare <2 
x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll index ef7fa221714..897c749cc0e 100644 --- a/test/CodeGen/X86/stack-folding-int-avx2.ll +++ b/test/CodeGen/X86/stack-folding-int-avx2.ll @@ -662,19 +662,19 @@ define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovsxbd ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> - %3 = sext <8 x i8> %2 to <8 x i32> - ret <8 x i32> %3 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) + ret <8 x i32> %2 } +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovsxbq ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> - %3 = sext <4 x i8> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) + ret <4 x i64> %2 } +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovsxbw @@ -704,61 +704,64 @@ define <4 x i64> 
@stack_fold_pmovsxwq(<8 x i16> %a0) { ;CHECK-LABEL: stack_fold_pmovsxwq ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> - %3 = sext <4 x i16> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) + ret <4 x i64> %2 } +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovzxbd ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> - %3 = zext <8 x i8> %2 to <8 x i32> - ret <8 x i32> %3 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) + ret <8 x i32> %2 } +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovzxbq ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> - %3 = zext <4 x i8> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) + ret <4 x i64> %2 } +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_pmovzxbw 
;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = zext <16 x i8> %a0 to <16 x i16> + %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ret <16 x i16> %2 } +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { ;CHECK-LABEL: stack_fold_pmovzxdq ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = zext <4 x i32> %a0 to <4 x i64> + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ret <4 x i64> %2 } +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { ;CHECK-LABEL: stack_fold_pmovzxwd ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = zext <8 x i16> %a0 to <8 x i32> + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ret <8 x i32> %2 } +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { ;CHECK-LABEL: stack_fold_pmovzxwq ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = shufflevector <8 x i16> %a0, <8 x 
i16> undef, <4 x i32> - %3 = zext <4 x i16> %2 to <4 x i64> - ret <4 x i64> %3 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) + ret <4 x i64> %2 } +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) { ;CHECK-LABEL: stack_fold_pmuldq diff --git a/test/Transforms/InstCombine/x86-pmovsx.ll b/test/Transforms/InstCombine/x86-pmovsx.ll new file mode 100644 index 00000000000..52cf4124210 --- /dev/null +++ b/test/Transforms/InstCombine/x86-pmovsx.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone + +; +; Basic sign extension tests +; + +define <8 x i32> @avx2_pmovsxbd(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovsxbd( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %v) + ret <8 x i32> %res +} + +define <4 x i64> @avx2_pmovsxbq(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovsxbq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %v) + ret <4 x i64> %res +} + +define <16 x i16> @avx2_pmovsxbw(<16 x i8> %v) nounwind readnone { +; 
CHECK-LABEL: @avx2_pmovsxbw( +; CHECK-NEXT: [[TMP1:%.*]] = sext <16 x i8> %v to <16 x i16> +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %v) + ret <16 x i16> %res +} + +define <4 x i64> @avx2_pmovsxdq(<4 x i32> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovsxdq( +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i32> %v to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP1]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %v) + ret <4 x i64> %res +} + +define <8 x i32> @avx2_pmovsxwd(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovsxwd( +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> %v to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %v) + ret <8 x i32> %res +} + +define <4 x i64> @avx2_pmovsxwq(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovsxwq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %v) + ret <4 x i64> %res +} diff --git a/test/Transforms/InstCombine/x86-pmovzx.ll b/test/Transforms/InstCombine/x86-pmovzx.ll new file mode 100644 index 00000000000..1853692d85b --- /dev/null +++ b/test/Transforms/InstCombine/x86-pmovzx.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone + +declare <8 x i32> 
@llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone + +; +; Basic zero extension tests +; + +define <4 x i32> @sse41_pmovzxbd(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxbd( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %v) + ret <4 x i32> %res +} + +define <2 x i64> @sse41_pmovzxbq(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxbq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i8> [[TMP1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %v) + ret <2 x i64> %res +} + +define <8 x i16> @sse41_pmovzxbw(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxbw( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> +; CHECK-NEXT: ret <8 x i16> [[TMP2]] +; + %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %v) + ret <8 x i16> %res +} + +define <2 x i64> @sse41_pmovzxdq(<4 x i32> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxdq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %v) + ret <2 x i64> %res +} + +define <4 x 
i32> @sse41_pmovzxwd(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxwd( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %v) + ret <4 x i32> %res +} + +define <2 x i64> @sse41_pmovzxwq(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @sse41_pmovzxwq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; + %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %v) + ret <2 x i64> %res +} + +define <8 x i32> @avx2_pmovzxbd(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxbd( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %v) + ret <8 x i32> %res +} + +define <4 x i64> @avx2_pmovzxbq(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxbq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> %v, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %v) + ret <4 x i64> %res +} + +define <16 x i16> @avx2_pmovzxbw(<16 x i8> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxbw( +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> %v to <16 x i16> +; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; + %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %v) + ret <16 x i16> %res +} + +define <4 x i64> @avx2_pmovzxdq(<4 x i32> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxdq( +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> %v to <4 x i64> +; CHECK-NEXT: ret <4 x i64> 
[[TMP1]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %v) + ret <4 x i64> %res +} + +define <8 x i32> @avx2_pmovzxwd(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxwd( +; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i16> %v to <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP1]] +; + %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %v) + ret <8 x i32> %res +} + +define <4 x i64> @avx2_pmovzxwq(<8 x i16> %v) nounwind readnone { +; CHECK-LABEL: @avx2_pmovzxwq( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[TMP2]] +; + %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %v) + ret <4 x i64> %res +}