From 2683baa8acbcfc44b94a7af781c43674d28d9a2e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 17 Mar 2014 18:58:11 +0000
Subject: [PATCH] R600: Match sign_extend_inreg to BFE instructions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204072 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUISelLowering.cpp | 111 ++++++++++++++++
 lib/Target/R600/AMDGPUISelLowering.h   |   6 +
 lib/Target/R600/AMDGPUInstrInfo.td     |   4 +
 lib/Target/R600/AMDGPUSubtarget.h      |   9 ++
 lib/Target/R600/AMDILISelLowering.cpp  |  35 -----
 lib/Target/R600/AMDILIntrinsics.td     |   4 -
 lib/Target/R600/R600ISelLowering.cpp   |   5 +
 lib/Target/R600/R600Instructions.td    |  17 ++-
 lib/Target/R600/SIInstructions.td      |  10 +-
 test/CodeGen/R600/mad_uint24.ll        |  16 +--
 test/CodeGen/R600/mul_uint24.ll        |  16 +--
 test/CodeGen/R600/sext-in-reg.ll       | 236 +++++++++++++++++++++++++++++++++
 test/CodeGen/R600/v1i64-kernel-arg.ll  |  17 +++
 13 files changed, 416 insertions(+), 70 deletions(-)
 create mode 100644 test/CodeGen/R600/sext-in-reg.ll
 create mode 100644 test/CodeGen/R600/v1i64-kernel-arg.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4e4b12eacc9..ddf251f38bf 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -211,6 +211,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FSUB, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
   }
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 }
 
 //===----------------------------------------------------------------------===//
@@ -927,6 +941,101 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
 }
 
+SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                                      unsigned BitsDiff,
+                                                      SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  SDValue Shift = DAG.getConstant(BitsDiff, VT);
+  // Shift left by 'Shift' bits.
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
+  // Signed shift Right by 'Shift' bits.
+  return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
+}
+
+SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT ScalarVT = VT.getScalarType();
+
+  unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
+  unsigned DestBits = ScalarVT.getSizeInBits();
+  unsigned BitsDiff = DestBits - SrcBits;
+
+  if (!Subtarget->hasBFE())
+    return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+
+  SDValue Src = Op.getOperand(0);
+  if (VT.isVector()) {
+    SDLoc DL(Op);
+    // Need to scalarize this, and revisit each of the scalars later.
+    // TODO: Don't scalarize on Evergreen?
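+    // For example, a sign_extend_inreg of <4 x i8> within <4 x i32> is
+    // rebuilt here as four scalar i32 sign_extend_inreg nodes, each of which
+    // is then lowered below to a single BFE_I32 with offset 0 and width 8.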
+    unsigned NElts = VT.getVectorNumElements();
+    SmallVector<SDValue, 8> Args;
+    ExtractVectorElements(Src, DAG, Args, 0, NElts);
+
+    SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+    for (unsigned I = 0; I < NElts; ++I)
+      Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
+  }
+
+  if (SrcBits == 32) {
+    SDLoc DL(Op);
+
+    // If the source is 32-bits, this is really half of a 2-register pair, and
+    // we need to discard the unused half of the pair.
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
+  }
+
+  unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+
+  // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+  // might not be worth the effort, and will need to expand to shifts when
+  // fixing SGPR copies.
+  if (SrcBits < 32 && DestBits <= 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    if (DestBits != 32)
+      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
+
+    // FIXME: This should use TargetConstant, but that hits assertions for
+    // Evergreen.
+    SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
+                              Op.getOperand(0), // Operand
+                              DAG.getConstant(0, ExtVT), // Offset
+                              DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    // Truncate to the original type if necessary.
+    if (ScalarVT == MVT::i32)
+      return Ext;
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
+  }
+
+  // For small types, extend to 32-bits first.
+  if (SrcBits < 32) {
+    SDLoc DL(Op);
+    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+
+    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
+    SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
+                                DL,
+                                ExtVT,
+                                TruncSrc, // Operand
+                                DAG.getConstant(0, ExtVT), // Offset
+                                DAG.getConstant(SrcBits, ExtVT)); // Width
+
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+  }
+
+  // For everything else, use the standard bitshift expansion.
+  return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
@@ -1019,6 +1128,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(FMIN)
   NODE_NAME_CASE(SMIN)
   NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(BFE_U32)
+  NODE_NAME_CASE(BFE_I32)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(EXPORT)
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2efb9c78a3e..2595c51d166 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -142,6 +142,10 @@ private:
   SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue ExpandSIGN_EXTEND_INREG(SDValue Op,
+                                  unsigned BitsDiff,
+                                  SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
   EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -171,6 +175,8 @@ enum {
   UMIN,
   URECIP,
   DOT4,
+  BFE_U32, // Extract range of bits with zero extension to 32-bits.
+  BFE_I32, // Extract range of bits with sign extension to 32-bits.
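+           // Both nodes take (src, bit offset, bit width) operands; a
+           // BFE_I32 with offset 0 and width 8, for example, sign-extends
+           // the low byte of src to 32 bits.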
   TEXTURE_FETCH,
   EXPORT,
   CONST_ADDRESS,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index fccede01ab9..2138bd23a36 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -86,3 +86,7 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
 
 def AMDGPUround : SDNode<"ISD::FROUND",
                          SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
+
+def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 7e7f4d0c004..8874d14c18c 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -68,6 +68,15 @@ public:
   enum Generation getGeneration() const;
   bool hasHWFP64() const;
   bool hasCaymanISA() const;
+
+  bool hasBFE() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasBFM() const {
+    return hasBFE();
+  }
+
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 970787ef31e..5dfaad4c1c3 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -94,9 +94,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
 
   for (unsigned int x = 0; x < NumTypes; ++x) {
     MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
-    //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
-    // We cannot sextinreg, expand to shifts
-    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
     setOperationAction(ISD::SUBE, VT, Expand);
     setOperationAction(ISD::SUBC, VT, Expand);
     setOperationAction(ISD::ADDE, VT, Expand);
@@ -191,14 +188,12 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
   setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
   setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
   setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
   setOperationAction(ISD::SUBC, MVT::Other, Expand);
   setOperationAction(ISD::ADDE, MVT::Other, Expand);
   setOperationAction(ISD::ADDC, MVT::Other, Expand);
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::BRIND, MVT::Other, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 
   // Use the default implementation.
@@ -322,36 +317,6 @@ AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
   return DST;
 }
 
-SDValue
-AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Data = Op.getOperand(0);
-  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
-  SDLoc DL(Op);
-  EVT DVT = Data.getValueType();
-  EVT BVT = BaseType->getVT();
-  unsigned baseBits = BVT.getScalarType().getSizeInBits();
-  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
-  unsigned shiftBits = srcBits - baseBits;
-  if (srcBits < 32) {
-    // If the op is less than 32 bits, then it needs to extend to 32bits
-    // so it can properly keep the upper bits valid.
-    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
-    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
-    shiftBits = 32 - baseBits;
-    DVT = IVT;
-  }
-  SDValue Shift = DAG.getConstant(shiftBits, DVT);
-  // Shift left by 'Shift' bits.
-  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
-  // Signed shift Right by 'Shift' bits.
-  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
-  if (srcBits < 32) {
-    // Once the sign extension is done, the op needs to be converted to
-    // its original type.
-    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
-  }
-  return Data;
-}
 
 EVT
 AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
   int iSize = (size * numEle);
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
index 6ec3559af24..658deb5bc01 100644
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ b/lib/Target/R600/AMDILIntrinsics.td
@@ -68,10 +68,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in {
 let TargetPrefix = "AMDIL", isTarget = 1 in {
   def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
 
-  def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
-          TernaryIntInt;
-  def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
-          TernaryIntInt;
   def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
           UnaryIntInt;
   def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 8c737125c85..4d15321fd02 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -1383,6 +1383,11 @@ SDValue R600TargetLowering::LowerFormalArguments(
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);
 
+    // i64 isn't a legal type, so the register type used ends up as i32, which
+    // isn't expected here. It attempts to create this sextload, but it ends up
+    // being invalid. Somehow this seems to work with i64 arguments, but breaks
+    // for <1 x i64>.
+
     // The first 36 bytes of the input buffer contains information about
     // thread group and global sizes.
     SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 698ad4afe60..ae3d8747a4d 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1517,15 +1517,20 @@ let Predicates = [isEGorCayman] in {
   // Example Usage:
   // (Offset, Width)
   //
-  // (0, 8)  = (Input << 24) >> 24 = (Input &       0xff) >> 0
-  // (8, 8)  = (Input << 16) >> 24 = (Input &     0xffff) >> 8
-  // (16,8)  = (Input <<  8) >> 24 = (Input &   0xffffff) >> 16
-  // (24,8)  = (Input <<  0) >> 24 = (Input & 0xffffffff) >> 24
+  // (0, 8)  = (Input << 24) >> 24 = (Input &       0xff) >> 0
+  // (8, 8)  = (Input << 16) >> 24 = (Input &     0xffff) >> 8
+  // (16, 8) = (Input <<  8) >> 24 = (Input &   0xffffff) >> 16
+  // (24, 8) = (Input <<  0) >> 24 = (Input & 0xffffffff) >> 24
   def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
-    [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1,
-                                               i32:$src2))],
+    [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))],
     VecALU
   >;
+
+  def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
+    [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
+    VecALU
+  >;
+
   // XXX: This pattern is broken, disabling for now. See comment in
   // AMDGPUInstructions.td for more info.
  // def : BFEPattern <BFE_UINT_eg>;
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9a18f7bc350..68b89a8c351 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1074,8 +1074,14 @@ def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
 def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
 def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
 def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
-def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
-def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
+
+let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+  [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+  [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
+}
+
 def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
 defm : BFIPatterns <V_BFI_B32>;
 def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 66a070ed9d4..3dcadc93d28 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -26,14 +26,11 @@ entry:
 ; The order of A and B does not matter.
 ; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
 ; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
-; EG-CHECK: 16
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
 ; EG-CHECK: 16
 ; SI-CHECK-LABEL: @i16_mad24
 ; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MAD]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
 
 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
@@ -51,14 +48,11 @@ entry:
 ; The order of A and B does not matter.
 ; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
 ; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MAD_CHAN]], literal.x
-; EG-CHECK: 24
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
-; EG-CHECK: 24
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
+; EG-CHECK: 8
 ; SI-CHECK-LABEL: @i8_mad24
 ; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
 
 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index 6e6d5496789..a4139619bfa 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -24,15 +24,11 @@ entry:
 ; The order of A and B does not matter.
 ; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
 ; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
-; EG-CHECK: 16
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG-CHECK: 16
 ; SI-CHECK-LABEL: @i16_mul24
 ; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
-
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
 define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %0 = mul i16 %a, %b
@@ -47,14 +43,10 @@ entry:
 ; The order of A and B does not matter.
 ; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
 ; The result must be sign-extended
-; EG-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], PV.[[MUL_CHAN]], literal.x
-; EG-CHECK: 24
-; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
-; EG-CHECK: 24
+; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; SI-CHECK-LABEL: @i8_mul24
 ; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
 
 define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
new file mode 100644
index 00000000000..f839bf8f8fe
--- /dev/null
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -0,0 +1,236 @@
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: @sext_in_reg_i1_i32
+; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
+  %shl = shl i32 %in, 31
+  %sext = ashr i32 %shl, 31
+  store i32 %sext, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %shl = shl i32 %c, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i16_to_i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 16
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %shl = shl i32 %c, 16
+  %ashr = ashr i32 %shl, 16
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_v1i32
+; SI: S_ADD_I32 [[VAL:s[0-9]+]],
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[VAL]], 0, 8
+; SI: BUFFER_STORE_DWORD [[EXTRACT]],
+
+; EG: BFE_INT
+define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+  %c = add <1 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <1 x i32> %c, <i32 24>
+  %ashr = ashr <1 x i32> %shl, <i32 24>
+  store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i8_to_i64
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; SI: BUFFER_STORE_DWORD
+
+; EG: BFE_INT
+; EG: ASHR
+define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+  %c = add i64 %a, %b
+  %shl = shl i64 %c, 56
+  %ashr = ashr i64 %shl, 56
+  store i64 %ashr, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_i16_to_i64
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; SI: BUFFER_STORE_DWORD
+
+; EG: BFE_INT
+; EG: ASHR
+define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+  %c = add i64 %a, %b
+  %shl = shl i64 %c, 48
+  %ashr = ashr i64 %shl, 48
+  store i64 %ashr, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
+; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
+; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: BUFFER_STORE_DWORD
+; XEG: BFE_INT
+; XEG: ASHR
+; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
+;   %c = add <1 x i64> %a, %b
+;   %shl = shl <1 x i64> %c, <i64 56>
+;   %ashr = ashr <1 x i64> %shl, <i64 56>
+;   store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
+;   ret void
+; }
+
+; FUNC-LABEL: @sext_in_reg_i1_in_i32_other_amount
+; SI-NOT: BFE
+; SI: S_LSHL_B32 [[REG:s[0-9]+]], {{s[0-9]+}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG]], 7
+; EG-NOT: BFE
+define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+  %c = add i32 %a, %b
+  %x = shl i32 %c, 6
+  %y = ashr i32 %x, 7
+  store i32 %y, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i1_in_v2i32_other_amount
+; SI: S_LSHL_B32 [[REG0:s[0-9]+]], {{s[0-9]}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG0]], 7
+; SI: S_LSHL_B32 [[REG1:s[0-9]+]], {{s[0-9]}}, 6
+; SI: S_ASHR_I32 {{s[0-9]+}}, [[REG1]], 7
+; EG-NOT: BFE
+define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b
+  %x = shl <2 x i32> %c, <i32 6, i32 6>
+  %y = ashr <2 x i32> %x, <i32 7, i32 7>
+  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
+  ret void
+}
+
+
+; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: BUFFER_STORE_DWORDX2
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i32> %c, <i32 31, i32 31>
+  %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
+  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: BUFFER_STORE_DWORDX4
+
+; EG: BFE
+; EG: BFE
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
+  %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i8_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: BUFFER_STORE_DWORDX2
+
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i32> %c, <i32 24, i32 24>
+  %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
+  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
+; SI: BUFFER_STORE_DWORDX4
+
+; EG: BFE
+; EG: BFE
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @sext_in_reg_v2i16_to_v2i32
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 16
+; SI: BUFFER_STORE_DWORDX2
+
+; EG: BFE
+; EG: BFE
+define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i32> %c, <i32 16, i32 16>
+  %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
+  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @testcase
+define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
+  %and_a_1 = and i8 %a, 1
+  %cmp_eq = icmp eq i8 %and_a_1, 0
+  %cmp_slt = icmp slt i8 %a, 0
+  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+  %xor = xor i8 %sel0, %sel1
+  store i8 %xor, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @testcase_3
+define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
+  %and_a_1 = and i8 %a, 1
+  %cmp_eq = icmp eq i8 %and_a_1, 0
+  %cmp_slt = icmp slt i8 %a, 0
+  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
+  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
+  %xor = xor i8 %sel0, %sel1
+  store i8 %xor, i8 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/v1i64-kernel-arg.ll b/test/CodeGen/R600/v1i64-kernel-arg.ll
new file mode 100644
index 00000000000..2aa1221b366
--- /dev/null
+++ b/test/CodeGen/R600/v1i64-kernel-arg.ll
@@ -0,0 +1,17 @@
+; REQUIRES: asserts
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
+
+; CHECK-LABEL: @kernel_arg_i64
+define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+  store i64 %a, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; i64 arg works, v1i64 arg does not.
+; CHECK-LABEL: @kernel_arg_v1i64
+define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+  store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
+  ret void
+}
+
-- 
2.11.0
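
For reference, the AMDGPUISD::BFE_I32 node introduced above takes (src, offset, width)
operands, and LowerSIGN_EXTEND_INREG always emits it with offset 0 and width equal to
the sext_inreg source width; targets without BFE get the shl/sra pair from
ExpandSIGN_EXTEND_INREG instead. A minimal host-side C++ model of that field-extract
semantic (the helper name bfe_i32 and the test values are illustrative, not taken from
the tree; signed right shift is assumed to behave arithmetically, as it does on the
relevant compilers):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Extract 'width' bits of 'src' starting at bit 'offset' and sign-extend the
// field to 32 bits. For offset == 0, the shl/sra pair below is exactly the
// ExpandSIGN_EXTEND_INREG fallback (shift amount = 32 - width).
static int32_t bfe_i32(uint32_t src, unsigned offset, unsigned width) {
  assert(width >= 1 && offset + width <= 32);
  unsigned shift = 32 - width; // BitsDiff in the patch, for an i32 result
  return (int32_t)((src >> offset) << shift) >> shift;
}

int main() {
  // Matches @sext_in_reg_i8_to_i32: BFE_I32(x, 0, 8) sign-extends the low
  // byte, so 0xff becomes -1 and 0x7f stays 0x7f.
  assert(bfe_i32(0x000000ffu, 0, 8) == -1);
  assert(bfe_i32(0x0000007fu, 0, 8) == 0x7f);
  // Signed analogue of the (8, 8) row in the BFE_UINT comment table above.
  assert(bfe_i32(0x00008000u, 8, 8) == -128);
  puts("bfe_i32 model: ok");
  return 0;
}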