From b19c087aa7efac4bd11f0d49a5c37b09e1c4708e Mon Sep 17 00:00:00 2001
From: Filipe Cabecinhas
Date: Thu, 8 May 2014 00:25:16 +0000
Subject: [PATCH] Lower certain build_vectors to insertps instructions

Summary:
Vectors built with zeros and elements in the same order as another
(source) vector are optimized to be built using a single insertps
instruction.
Also optimize when we move one element in a vector to a different place
in that vector while zeroing out some of the other elements.

Further optimizations are possible, described in TODO comments.
I will be implementing at least some of them in the near future.

Added some tests for different cases where this optimization triggers.

Reviewers: nadav, delena, craig.topper

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D3521

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208271 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 100 ++++++++++++++
 test/CodeGen/X86/sse41.ll          | 256 +++++++++++++++++++++++++++++++++++++
 2 files changed, 356 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 380017feff6..5355053241a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5437,6 +5437,98 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   return V;
 }
 
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+///
+/// Tries to emit a single INSERTPS when every operand that is neither undef
+/// nor zero is an EXTRACT_VECTOR_ELT, with a constant index, of one common
+/// source vector, and at most one of them sits in a lane other than the one
+/// it was extracted from. Lanes whose bit is clear in NonZeros are zeroed via
+/// the INSERTPS zero mask. Returns an empty SDValue when the pattern does not
+/// match (or SSE4.1 is unavailable) so the caller can try other lowerings.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+                                     unsigned NonZeros, unsigned NumNonZero,
+                                     unsigned NumZero, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget,
+                                     const TargetLowering &TLI) {
+  // INSERTPS is an SSE4.1 instruction; do not form the node without it.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  // The caller promises at least one non-zero element, but never walk past
+  // the last operand if that promise is broken.
+  unsigned FirstNonZeroIdx = 0;
+  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+         X86::isZeroNode(FirstNonZero)) {
+    if (++FirstNonZeroIdx == NumElems)
+      return SDValue();
+    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  }
+
+  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+    return SDValue();
+
+  // The source must have the same 4 x 32-bit type as the result: the
+  // immediate only has two bits to select the source element.
+  EVT VT = Op.getSimpleValueType();
+  SDValue V = FirstNonZero.getOperand(0);
+  if (V.getValueType() != VT)
+    return SDValue();
+
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+    SDValue Elem = Op.getOperand(Idx);
+    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+      continue;
+
+    // TODO: What else can be here? Deal with it.
+    // The index must be a constant: the cast<> below asserts otherwise.
+    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Elem.getOperand(1)))
+      return SDValue();
+
+    // TODO: Some optimizations are still possible here
+    // ex: Getting one element from a vector, and the rest from another.
+    if (Elem.getOperand(0) != V)
+      return SDValue();
+
+    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+    if (Dst == Idx)
+      ++CorrectIdx;
+    else if (IncorrectIdx == -1U) {
+      IncorrectIdx = Idx;
+      IncorrectDst = Dst;
+    } else
+      // There was already one element with an incorrect index.
+      // We can't optimize this case to an insertps.
+      return SDValue();
+  }
+
+  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+    SDLoc dl(Op);
+    // INSERTPS immediate: bits 7:6 pick the source element, bits 5:4 the
+    // destination lane, bits 3:0 the lanes to zero.
+    unsigned ElementMoveMask = 0;
+    if (IncorrectIdx == -1U)
+      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+    else
+      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+    SDValue InsertpsMask = DAG.getIntPtrConstant(
+      ElementMoveMask | (~NonZeros & 0xf));
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  }
+
+  return SDValue();
+}
+
 /// getVShift - Return a vector logical shift node.
 ///
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -6187,6 +6279,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (V.getNode()) return V;
   }
 
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4) {
+    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+                                      NumZero, DAG, Subtarget, *this);
+    if (V.getNode())
+      return V;
+  }
+
   // If element VT is == 32 bits, turn it into a number of shuffles.
SmallVector V(NumElems); if (NumElems == 4 && NumZero > 0) { diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 8db97d9071a..db0d9c5c116 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -320,3 +320,259 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> ret <4 x i32> %result } + +;;;;;; Shuffles optimizable with a single insertps instruction +define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext3 = extractelement <4 x float> %x, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + ret <4 x float> %vecinit5 +} + +define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYY0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $104 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + 
%vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + ret <4 x float> %vecinit5 +} + +define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_XYW0: +; CHECK: insertps $232 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecext2 = extractelement <4 x float> %x, i32 3 + %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_W00W: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $198 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 3 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X00A: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X00X(<4 x float> %x, <4 
x float> %a) { +; CHECK-LABEL: shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 + %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> + ret <4 x float> %vecinit4 +} + +define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { +; CHECK-LABEL: shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 + %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> + %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> + ret <4 x float> %vecinit5 +} + +define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYZ0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $8 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext3 = extractelement <4 x i32> %x, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 + %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 + ret <4 x i32> %vecinit5 +} + +define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XY00: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $12 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 
= insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYY0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $104 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2 + %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 + ret <4 x i32> %vecinit5 +} + +define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_XYW0: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $232 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecext1 = extractelement <4 x i32> %x, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 + %vecext2 = extractelement <4 x i32> %x, i32 3 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_W00W: +; CHECK-NOT: pextrd +; CHECK-NOT: punpckldq +; CHECK: insertps $198 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 3 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 + %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3 + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00A: +; 
CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X00X: +; CHECK-NOT: movaps +; CHECK-NOT: shufps +; CHECK: insertps $48 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 + %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> + ret <4 x i32> %vecinit4 +} + +define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { +; CHECK-LABEL: i32_shuf_X0YC: +; CHECK: shufps +; CHECK-NOT: movhlps +; CHECK-NOT: shufps +; CHECK: insertps $176 +; CHECK: ret + %vecext = extractelement <4 x i32> %x, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 + %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> + %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> + ret <4 x i32> %vecinit5 +} + +;; Test for a bug in the first implementation of LowerBuildVectorv4x32 +define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { +; CHECK-LABEL: test_insertps_no_undef: +; CHECK: movaps %xmm0, %xmm1 +; CHECK-NEXT: insertps $8, %xmm1, %xmm1 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: ret + %vecext = extractelement <4 x float> %x, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %x, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 
+ %vecext3 = extractelement <4 x float> %x, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 + %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 + %mask = fcmp olt <4 x float> %vecinit5, %x + %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5 + ret <4 x float> %res +} -- 2.11.0