From a41520cf9b9cefed2091a0624a34c5f7fdb42a68 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Wed, 14 Aug 2013 23:25:00 +0000
Subject: [PATCH] R600/SI: Improve legalization of vector operations

This should fix hangs in the OpenCL piglit tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@188431 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/R600/AMDGPUISelLowering.cpp |  49 ++++++++++++++-
 lib/Target/R600/AMDGPUISelLowering.h   |   5 ++
 lib/Target/R600/SIISelLowering.cpp     |   6 +-
 lib/Target/R600/SIISelLowering.h       |   1 +
 test/CodeGen/R600/si-vector-hang.ll    | 111 +++++++++++++++++++++++++++++++++
 5 files changed, 167 insertions(+), 5 deletions(-)
 create mode 100644 test/CodeGen/R600/si-vector-hang.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 9bb487e550c..1e799988987 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -79,8 +79,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
 
   setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
@@ -182,6 +184,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   // AMDGPU DAG lowering
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   }
@@ -208,6 +212,47 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   return DAG.getConstant(Offset, TD->getPointerSize() == 8 ?
       MVT::i64 : MVT::i32);
 }
+void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                                                 SmallVectorImpl<SDValue> &Args,
+                                                 unsigned Start,
+                                                 unsigned Count) const {
+  EVT VT = Op.getValueType();
+  for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                               VT.getVectorElementType(),
+                               Op, DAG.getConstant(i, MVT::i32)));
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  ExtractVectorElements(A, DAG, Args, 0,
+                        A.getValueType().getVectorNumElements());
+  ExtractVectorElements(B, DAG, Args, 0,
+                        B.getValueType().getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  EVT VT = Op.getValueType();
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
+                        VT.getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+
 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     SelectionDAG &DAG) const {
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 5419e71e7f2..9adbb543d34 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -25,6 +25,11 @@ class MachineRegisterInfo;
 
 class AMDGPUTargetLowering : public TargetLowering {
 private:
+  void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &Args,
+                             unsigned Start, unsigned Count) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 30a510de91c..0bd8bce51a3 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -34,9 +34,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
 
-  addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
-  addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);
-
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
@@ -110,6 +107,9 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   return VT.bitsGT(MVT::i32);
 }
 
+bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
+  return VT.bitsLE(MVT::i8);
+}
 
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
                                          SDValue Chain,
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 321e58c153b..9c54a6f48aa 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -47,6 +47,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
 public:
   SITargetLowering(TargetMachine &tm);
   bool allowsUnalignedMemoryAccesses(EVT VT, bool *IsFast) const;
+  virtual bool shouldSplitVectorElementType(EVT VT) const;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
new file mode 100644
index 00000000000..0b0e210d5a6
--- /dev/null
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; XXX: Mark this test as XFAIL until buffer stores are implemented
+; XFAIL: *
+; CHECK: @test_8_min_char
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; ModuleID = 'radeon'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; Function Attrs: nounwind
+define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+entry:
+  %0 = load i8 addrspace(1)* %in0, align 1, !tbaa !9
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %arrayidx2.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 1
+  %2 = load i8 addrspace(1)* %arrayidx2.i.i, align 1, !tbaa !9
+  %3 = insertelement <8 x i8> %1, i8 %2, i32 1
+  %arrayidx6.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 2
+  %4 = load i8 addrspace(1)* %arrayidx6.i.i, align 1, !tbaa !9
+  %5 = insertelement <8 x i8> %3, i8 %4, i32 2
+  %arrayidx10.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 3
+  %6 = load i8 addrspace(1)* %arrayidx10.i.i, align 1, !tbaa !9
+  %7 = insertelement <8 x i8> %5, i8 %6, i32 3
+  %arrayidx.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 4
+  %8 = load i8 addrspace(1)* %arrayidx.i.i, align 1, !tbaa !9
+  %9 = insertelement <8 x i8> undef, i8 %8, i32 0
+  %arrayidx2.i9.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 5
+  %10 = load i8 addrspace(1)* %arrayidx2.i9.i, align 1, !tbaa !9
+  %11 = insertelement <8 x i8> %9, i8 %10, i32 1
+  %arrayidx6.i11.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 6
+  %12 = load i8 addrspace(1)* %arrayidx6.i11.i, align 1, !tbaa !9
+  %13 = insertelement <8 x i8> %11, i8 %12, i32 2
+  %arrayidx10.i13.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 7
+  %14 = load i8 addrspace(1)* %arrayidx10.i13.i, align 1, !tbaa !9
+  %15 = insertelement <8 x i8> %13, i8 %14, i32 3
+  %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %16 = load i8 addrspace(1)* %in1, align 1, !tbaa !9
+  %17 = insertelement <8 x i8> undef, i8 %16, i32 0
+  %arrayidx2.i.i4 = getelementptr inbounds i8 addrspace(1)* %in1, i64 1
+  %18 = load i8 addrspace(1)* %arrayidx2.i.i4, align 1, !tbaa !9
+  %19 = insertelement <8 x i8> %17, i8 %18, i32 1
+  %arrayidx6.i.i5 = getelementptr inbounds i8 addrspace(1)* %in1, i64 2
+  %20 = load i8 addrspace(1)* %arrayidx6.i.i5, align 1, !tbaa !9
+  %21 = insertelement <8 x i8> %19, i8 %20, i32 2
+  %arrayidx10.i.i6 = getelementptr inbounds i8 addrspace(1)* %in1, i64 3
+  %22 = load i8 addrspace(1)* %arrayidx10.i.i6, align 1, !tbaa !9
+  %23 = insertelement <8 x i8> %21, i8 %22, i32 3
+  %arrayidx.i.i7 = getelementptr inbounds i8 addrspace(1)* %in1, i64 4
+  %24 = load i8 addrspace(1)* %arrayidx.i.i7, align 1, !tbaa !9
+  %25 = insertelement <8 x i8> undef, i8 %24, i32 0
+  %arrayidx2.i9.i8 = getelementptr inbounds i8 addrspace(1)* %in1, i64 5
+  %26 = load i8 addrspace(1)* %arrayidx2.i9.i8, align 1, !tbaa !9
+  %27 = insertelement <8 x i8> %25, i8 %26, i32 1
+  %arrayidx6.i11.i9 = getelementptr inbounds i8 addrspace(1)* %in1, i64 6
+  %28 = load i8 addrspace(1)* %arrayidx6.i11.i9, align 1, !tbaa !9
+  %29 = insertelement <8 x i8> %27, i8 %28, i32 2
+  %arrayidx10.i13.i10 = getelementptr inbounds i8 addrspace(1)* %in1, i64 7
+  %30 = load i8 addrspace(1)* %arrayidx10.i13.i10, align 1, !tbaa !9
+  %31 = insertelement <8 x i8> %29, i8 %30, i32 3
+  %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
+  %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11
+  %32 = extractelement <8 x i8> %cond.i, i32 0
+  store i8 %32, i8 addrspace(1)* %out, align 1, !tbaa !9
+  %33 = extractelement <8 x i8> %cond.i, i32 1
+  %arrayidx2.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 1
+  store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1, !tbaa !9
+  %34 = extractelement <8 x i8> %cond.i, i32 2
+  %arrayidx.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 2
+  store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1, !tbaa !9
+  %35 = extractelement <8 x i8> %cond.i, i32 3
+  %arrayidx2.i6.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 3
+  store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1, !tbaa !9
+  %arrayidx.i.i3 = getelementptr inbounds i8 addrspace(1)* %out, i64 4
+  %36 = extractelement <8 x i8> %cond.i, i32 4
+  store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1, !tbaa !9
+  %37 = extractelement <8 x i8> %cond.i, i32 5
+  %arrayidx2.i.i6.i = getelementptr inbounds i8 addrspace(1)* %out, i64 5
+  store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1, !tbaa !9
+  %38 = extractelement <8 x i8> %cond.i, i32 6
+  %arrayidx.i.i7.i = getelementptr inbounds i8 addrspace(1)* %out, i64 6
+  store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1, !tbaa !9
+  %39 = extractelement <8 x i8> %cond.i, i32 7
+  %arrayidx2.i6.i8.i = getelementptr inbounds i8 addrspace(1)* %out, i64 7
+  store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1, !tbaa !9
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = metadata !{null}
+!1 = metadata !{null}
+!2 = metadata !{null}
+!3 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!4 = metadata !{null}
+!5 = metadata !{null}
+!6 = metadata !{null}
+!7 = metadata !{null}
+!8 = metadata !{null}
+!9 = metadata !{metadata !"omnipotent char", metadata !10}
+!10 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
2.11.0