From 91c642526e40b77f44a03b99b5d450cd293b78aa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 3 Nov 2015 20:27:01 +0000
Subject: [PATCH] [X86][XOP] Add support for the matching of the VPCMOV bit
 select instruction

XOP has the VPCMOV instruction that performs the common vector bit select
operation OR( AND( SRC1, SRC3 ), AND( SRC2, ~SRC3 ) )

This patch adds tablegen pattern matching for this instruction.

Differential Revision: http://reviews.llvm.org/D8841

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251975 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/IR/AutoUpgrade.cpp                    |  11 ++
 lib/Target/X86/X86InstrXOP.td             |  10 ++
 test/CodeGen/X86/xop-intrinsics-x86_64.ll |   5 +-
 test/CodeGen/X86/xop-pcmov.ll             | 162 ++++++++++++++++++++++++++++++
 4 files changed, 185 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/X86/xop-pcmov.ll

diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 9de149559df..12c354c89b2 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -200,6 +200,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx2.pblendd.128" ||
         Name == "x86.avx2.pblendd.256" ||
         Name == "x86.avx2.vbroadcasti128" ||
+        Name == "x86.xop.vpcmov" ||
         (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
       NewFn = nullptr;
       return true;
@@ -457,6 +458,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Rep = Builder.CreateCall(VPCOM,
                              {CI->getArgOperand(0), CI->getArgOperand(1),
                               Builder.getInt8(Imm)});
+  } else if (Name == "llvm.x86.xop.vpcmov") {
+    Value *Arg0 = CI->getArgOperand(0);
+    Value *Arg1 = CI->getArgOperand(1);
+    Value *Sel = CI->getArgOperand(2);
+    unsigned NumElts = CI->getType()->getVectorNumElements();
+    Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1));
+    Value *NotSel = Builder.CreateXor(Sel, MinusOne);
+    Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
+    Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
+    Rep = Builder.CreateOr(Sel0, Sel1);
   } else if (Name == "llvm.x86.sse42.crc32.64.8") {
     Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
                                                 Intrinsic::x86_sse42_crc32_32_8);
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index df9d906f4d9..4cb2304e464 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -281,6 +281,16 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
 let ExeDomain = SSEPackedInt in
   defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
 
+let Predicates = [HasXOP] in {
+  def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+                       (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+  def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+                       (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+}
+
 multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
                   Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
   def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -989,15 +989,14 @@ define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float>
 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
 
 define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
-  ; CHECK: vpcmov
+  ; CHECK: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
   %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
 
 define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
-  ; CHECK: vpcmov
-  ; CHECK: ymm
+  ; CHECK: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
   %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
   ret <4 x i64> %res
 }
diff --git a/test/CodeGen/X86/xop-pcmov.ll b/test/CodeGen/X86/xop-pcmov.ll
new file mode 100644
index 00000000000..165d4a7232d
--- /dev/null
+++ b/test/CodeGen/X86/xop-pcmov.ll
@@ -0,0 +1,162 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s
+
+define <4 x double> @pcmov_4f64(<4 x double> %a, <4 x double> %b, <4 x double> %m) {
+; CHECK-LABEL: pcmov_4f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <4 x double> %m to <4 x i64>
+  %2 = bitcast <4 x double> %a to <4 x i64>
+  %3 = and <4 x i64> %1, %2
+  %4 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %5 = bitcast <4 x double> %b to <4 x i64>
+  %6 = and <4 x i64> %4, %5
+  %7 = or <4 x i64> %3, %6
+  %8 = bitcast <4 x i64> %7 to <4 x double>
+  ret <4 x double> %8
+}
+
+define <2 x double> @pcmov_2f64(<2 x double> %a, <2 x double> %b, <2 x double> %m) {
+; CHECK-LABEL: pcmov_2f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <2 x double> %m to <2 x i64>
+  %2 = bitcast <2 x double> %a to <2 x i64>
+  %3 = and <2 x i64> %1, %2
+  %4 = xor <2 x i64> %1, <i64 -1, i64 -1>
+  %5 = bitcast <2 x double> %b to <2 x i64>
+  %6 = and <2 x i64> %4, %5
+  %7 = or <2 x i64> %3, %6
+  %8 = bitcast <2 x i64> %7 to <2 x double>
+  ret <2 x double> %8
+}
+
+define <8 x float> @pcmov_8f32(<8 x float> %a, <8 x float> %b, <8 x float> %m) {
+; CHECK-LABEL: pcmov_8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <8 x float> %m to <8 x i32>
+  %2 = bitcast <8 x float> %a to <8 x i32>
+  %3 = and <8 x i32> %1, %2
+  %4 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = bitcast <8 x float> %b to <8 x i32>
+  %6 = and <8 x i32> %4, %5
+  %7 = or <8 x i32> %3, %6
+  %8 = bitcast <8 x i32> %7 to <8 x float>
+  ret <8 x float> %8
+}
+
+define <4 x float> @pcmov_4f32(<4 x float> %a, <4 x float> %b, <4 x float> %m) {
+; CHECK-LABEL: pcmov_4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = bitcast <4 x float> %m to <4 x i32>
+  %2 = bitcast <4 x float> %a to <4 x i32>
+  %3 = and <4 x i32> %1, %2
+  %4 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = bitcast <4 x float> %b to <4 x i32>
+  %6 = and <4 x i32> %4, %5
+  %7 = or <4 x i32> %3, %6
+  %8 = bitcast <4 x i32> %7 to <4 x float>
+  ret <4 x float> %8
+}
+
+define <4 x i64> @pcmov_4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %m) {
+; CHECK-LABEL: pcmov_4i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = and <4 x i64> %a, %m
+  %2 = xor <4 x i64> %m, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %3 = and <4 x i64> %b, %2
+  %4 = or <4 x i64> %1, %3
+  ret <4 x i64> %4
+}
+
+define <2 x i64> @pcmov_2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m) {
+; CHECK-LABEL: pcmov_2i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = and <2 x i64> %a, %m
+  %2 = xor <2 x i64> %m, <i64 -1, i64 -1>
+  %3 = and <2 x i64> %b, %2
+  %4 = or <2 x i64> %1, %3
+  ret <2 x i64> %4
+}
+
+define <8 x i32> @pcmov_8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %m) {
+; CHECK-LABEL: pcmov_8i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = and <8 x i32> %a, %m
+  %2 = xor <8 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %3 = and <8 x i32> %b, %2
+  %4 = or <8 x i32> %1, %3
+  ret <8 x i32> %4
+}
+
+define <4 x i32> @pcmov_4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %m) {
+; CHECK-LABEL: pcmov_4i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = and <4 x i32> %a, %m
+  %2 = xor <4 x i32> %m, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %3 = and <4 x i32> %b, %2
+  %4 = or <4 x i32> %1, %3
+  ret <4 x i32> %4
+}
+
+define <16 x i16> @pcmov_16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %m) {
+; CHECK-LABEL: pcmov_16i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = and <16 x i16> %a, %m
+  %2 = xor <16 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %3 = and <16 x i16> %b, %2
+  %4 = or <16 x i16> %1, %3
+  ret <16 x i16> %4
+}
+
+define <8 x i16> @pcmov_8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %m) {
+; CHECK-LABEL: pcmov_8i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = and <8 x i16> %a, %m
+  %2 = xor <8 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %3 = and <8 x i16> %b, %2
+  %4 = or <8 x i16> %1, %3
+  ret <8 x i16> %4
+}
+
+define <32 x i8> @pcmov_32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %m) {
+; CHECK-LABEL: pcmov_32i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = and <32 x i8> %a, %m
+  %2 = xor <32 x i8> %m, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <32 x i8> %b, %2
+  %4 = or <32 x i8> %1, %3
+  ret <32 x i8> %4
+}
+
+define <16 x i8> @pcmov_16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %m) {
+; CHECK-LABEL: pcmov_16i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = and <16 x i8> %a, %m
+  %2 = xor <16 x i8> %m, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <16 x i8> %b, %2
+  %4 = or <16 x i8> %1, %3
+  ret <16 x i8> %4
+}
-- 
2.11.0