From 5ea7215050818473565584b76bdf53c862754a79 Mon Sep 17 00:00:00 2001
From: Filipe Cabecinhas
Date: Fri, 16 May 2014 22:47:49 +0000
Subject: [PATCH] Lower vselects into X86ISD::BLENDI when appropriate.

LowerVSELECT will, if possible, generate an X86ISD::BLENDI DAG node if the
condition is constant and we can emit that instruction, given the
subtarget.

This is not enough for all cases. An additional SELECTCombine optimization
will be committed.

Fixed tests that were expecting variable blends but for which a blend+imm
can now be generated.
Added a test where we can't emit blend+immediate.
Added avx2 blend+imm tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209043 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 84 +++++++++++++++++++++++++++++++++++++-
 test/CodeGen/X86/avx-blend.ll      | 10 ++---
 test/CodeGen/X86/avx2.ll           | 25 ++++++++++++
 test/CodeGen/X86/blend-msb.ll      | 12 ++----
 test/CodeGen/X86/sse41-blend.ll    |  8 ++--
 test/CodeGen/X86/sse41.ll          |  8 ++++
 6 files changed, 128 insertions(+), 19 deletions(-)
 create mode 100644 test/CodeGen/X86/avx2.ll
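
As a standalone illustration of the mask folding this patch introduces (a
minimal C++ sketch, not LLVM code: foldBlendMask and its int-encoded
conditions are hypothetical stand-ins for BUILD_VECTORtoBlendMask and its
ConstantSDNode/undef operands):

#include <cassert>
#include <vector>

// Each condition element is 1 (true), 0 (false), or -1 (undef). For types
// with two 128-bit lanes (e.g. v16i16) both lanes must agree on a single
// 8-bit immediate; an undef element matches whatever the other lane wants.
static bool foldBlendMask(const std::vector<int> &Cond, unsigned &MaskValue) {
  MaskValue = 0;
  unsigned NumElems = Cond.size();
  unsigned NumLanes = (NumElems - 1) / 8 + 1; // 2 lanes iff NumElems > 8
  unsigned NumElemsInLane = NumElems / NumLanes;
  for (unsigned i = 0; i < NumElemsInLane; ++i) {
    int Lane1Cond = Cond[i];
    int Lane2Cond = (NumLanes == 2) ? Cond[i + NumElemsInLane] : Lane1Cond;
    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
      MaskValue |= !!Lane1Cond << i; // undef arbitrarily folds to 1
    else if (Lane1Cond < 0)
      MaskValue |= !!Lane2Cond << i;
    else
      return false; // lanes disagree: no single immediate encodes this
  }
  return true;
}

int main() {
  unsigned Mask;
  // <i1 true, i1 false, i1 true, i1 false> folds to 0b0101 = 5, the
  // immediate the updated vsel_float/vsel_i32 tests below check for.
  assert(foldBlendMask({1, 0, 1, 0}, Mask) && Mask == 5);
  // A v16i16 condition whose lanes disagree cannot be encoded; this is why
  // the lowering requires lane symmetry for v16i16.
  assert(!foldBlendMask({1, 0, 0, 0, 0, 0, 0, 0,
                         0, 1, 0, 0, 0, 0, 0, 0}, Mask));
  return 0;
}
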
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 860af1284b2..c6e7730f827 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7971,7 +7971,87 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+// This function assumes its argument is a BUILD_VECTOR of constant or
+// undef SDNodes, i.e. ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symmetric for both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      MaskValue |= !!Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !!Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, BlendVT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
   // Some types for vselect were previously set to Expand, not Legal or
   // Custom. Return an empty SDValue so we fall-through to Expand, after
   // the Custom lowering phase.
@@ -7984,7 +8064,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     return SDValue();
   }
 
-  // This node is Legal.
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
   return Op;
 }
 
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 5fcd5ff5f4c..4757ce0c9fa 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,7 @@
 ; AVX128 tests:
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +12,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $5
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +52,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 
 ;CHECK-LABEL: vsel_float8:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK: ret
 define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +61,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
 
 ;CHECK-LABEL: vsel_i328:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $17
 ;CHECK-NEXT: ret
 define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -86,7 +86,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
 
 ;CHECK-LABEL: vsel_double4:
 ;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vblendpd $5
 ;CHECK-NEXT: ret
 define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
diff --git a/test/CodeGen/X86/avx2.ll b/test/CodeGen/X86/avx2.ll
new file mode 100644
index 00000000000..290d0b6557b
--- /dev/null
+++ b/test/CodeGen/X86/avx2.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %ret
+}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 6b465963292..4e17a714bf5 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
 ; Verify that we produce movss instead of blendvps when possible.
 
 ;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
 }
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
 ;CHECK: movss
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,8 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   ret <4 x i8> %vsel
 }
 
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
 ;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+;CHECK: pblendw $17
 ;CHECK: ret
 define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
   %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 4681fde7548..951bb7dc854 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
 
 
 ;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 }
 
 ;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
 
 
 ;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
 ;CHECK: ret
 define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
   %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index db0d9c5c116..3652d8c0d02 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -576,3 +576,11 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
   %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
   ret <4 x float> %res
 }
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %ret
+}
-- 
2.11.0
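
A closing note on the updated CHECK lines: per the Intel SDM, a set bit in a
BLENDPS/BLENDPD/PBLENDW immediate selects that element from the second
source operand, and the tests above pin only the mnemonic and the immediate,
not the register operand order. A minimal scalar model of that convention
(blendImm is a hypothetical name, not an LLVM or Intel API):

#include <array>
#include <cassert>
#include <cstddef>

// Dst[i] = bit i of Imm ? Src2[i] : Src1[i], the BLENDPS-style convention.
template <typename T, std::size_t N>
std::array<T, N> blendImm(const std::array<T, N> &Src1,
                          const std::array<T, N> &Src2, unsigned Imm) {
  std::array<T, N> Dst{};
  for (std::size_t i = 0; i != N; ++i)
    Dst[i] = ((Imm >> i) & 1) ? Src2[i] : Src1[i];
  return Dst;
}

int main() {
  // A constant select condition <true, false, true, false> keeps elements
  // 0 and 2 of the "true" operand; with that operand as the second source,
  // the encoding is 0b0101 = 5, matching the vblendps $5 checks above.
  std::array<int, 4> FalseV = {10, 11, 12, 13};
  std::array<int, 4> TrueV = {20, 21, 22, 23};
  std::array<int, 4> R = blendImm(FalseV, TrueV, 5);
  assert(R[0] == 20 && R[1] == 11 && R[2] == 22 && R[3] == 13);
  return 0;
}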