From 7eacad03efda36e09ebd96e95d7891cadaaa9087 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 12 Feb 2013 14:18:49 +0000 Subject: [PATCH] [NVPTX] Disable vector registers Vectors were being manually scalarized by the backend. Instead, let the target-independent code do all of the work. The manual scalarization was from a time before good target-independent support for scalarization in LLVM. However, this forces us to specially-handle vector loads and stores, which we can turn into PTX instructions that produce/consume multiple operands. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@174968 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsNVVM.td | 10 + lib/Target/NVPTX/CMakeLists.txt | 1 - lib/Target/NVPTX/NVPTX.h | 1 - lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 46 +- lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 809 +++++++++++++++++--- lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 7 +- lib/Target/NVPTX/NVPTXISelLowering.cpp | 458 ++++++++++-- lib/Target/NVPTX/NVPTXISelLowering.h | 21 +- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 40 - lib/Target/NVPTX/NVPTXInstrInfo.td | 96 ++- lib/Target/NVPTX/NVPTXIntrinsics.td | 145 ++-- lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 156 ---- lib/Target/NVPTX/NVPTXRegisterInfo.h | 4 - lib/Target/NVPTX/NVPTXRegisterInfo.td | 44 -- lib/Target/NVPTX/NVPTXSubtarget.h | 1 + lib/Target/NVPTX/NVPTXTargetMachine.cpp | 1 - lib/Target/NVPTX/VectorElementize.cpp | 1239 ------------------------------- lib/Target/NVPTX/gen-register-defs.py | 202 ----- test/CodeGen/NVPTX/vector-loads.ll | 66 ++ 19 files changed, 1350 insertions(+), 1997 deletions(-) delete mode 100644 lib/Target/NVPTX/VectorElementize.cpp delete mode 100644 lib/Target/NVPTX/gen-register-defs.py create mode 100644 test/CodeGen/NVPTX/vector-loads.ll diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index 6b853001e77..ebfd03e4849 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -805,6 +805,16 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], [LLVMPointerType>], [IntrReadMem, NoCapture<0>], "llvm.nvvm.ldu.global.p">; +// Generated within nvvm. Use for ldg on sm_35 or later +def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty], + [LLVMPointerType>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.i">; +def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty], + [LLVMPointerType>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.f">; +def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], + [LLVMPointerType>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.p">; // Use for generic pointers // - These intrinsics are used to convert address spaces. 
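
Illustrative aside, not part of the patch: a minimal standalone sketch (plain C++, no LLVM dependencies, all names invented) of the idea the commit message describes. Instead of carrying vector registers through the backend, a <4 x float> load is legalized into a single target node with four scalar results, which the printer then emits as one PTX instruction writing four scalar registers, e.g. ld.global.v4.f32.

#include <cstdio>
#include <string>

// Model of a vector load after legalization: one memory operation whose
// results are N scalar registers rather than one vector register.
struct VectorLoad {
  std::string EltType;  // PTX element type, e.g. "f32"
  unsigned NumElts;     // 2 or 4 -- the only arities PTX ld.v* supports
  std::string Addr;     // symbolic address operand
};

// Print the PTX-like form: a single instruction with multiple scalar
// destination operands.
static void emitLoadV(const VectorLoad &L, unsigned FirstReg) {
  std::printf("ld.global.v%u.%s \t{", L.NumElts, L.EltType.c_str());
  for (unsigned i = 0; i < L.NumElts; ++i)
    std::printf("%%f%u%s", FirstReg + i, i + 1 == L.NumElts ? "" : ", ");
  std::printf("}, [%s];\n", L.Addr.c_str());
}

int main() {
  emitLoadV({"f32", 4, "ptr"}, 0);  // <4 x float> -> ld.global.v4.f32 {%f0, %f1, %f2, %f3}, [ptr];
  emitLoadV({"f32", 2, "ptr"}, 4);  // <2 x float> -> ld.global.v2.f32 {%f4, %f5}, [ptr];
  return 0;
}
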
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 7cb16b4dd81..47baef66961 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -22,7 +22,6 @@ set(NVPTXCodeGen_sources NVPTXAllocaHoisting.cpp NVPTXAsmPrinter.cpp NVPTXUtilities.cpp - VectorElementize.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 097b50aa4e1..b46ea881c4b 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -53,7 +53,6 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); -FunctionPass *createVectorElementizePass(NVPTXTargetMachine &); FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 3f99d1d0e4a..0115e1f5d3a 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -503,21 +503,7 @@ NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, O << getNVPTXRegClassStr(RC) << mapped_vr; return; } - // Vector virtual register - if (getNVPTXVectorSize(RC) == 4) - O << "{" - << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_1, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_2, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_3" - << "}"; - else if (getNVPTXVectorSize(RC) == 2) - O << "{" - << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_1" - << "}"; - else - llvm_unreachable("Unsupported vector size"); + report_fatal_error("Bad register!"); } void @@ -2024,29 +2010,9 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) case NVPTX::StoreParamI64: case NVPTX::StoreParamI8: case NVPTX::StoreParamS32I8: case NVPTX::StoreParamU32I8: case NVPTX::StoreParamS32I16: case NVPTX::StoreParamU32I16: - case NVPTX::StoreParamScalar2F32: case NVPTX::StoreParamScalar2F64: - case NVPTX::StoreParamScalar2I16: case NVPTX::StoreParamScalar2I32: - case NVPTX::StoreParamScalar2I64: case NVPTX::StoreParamScalar2I8: - case NVPTX::StoreParamScalar4F32: case NVPTX::StoreParamScalar4I16: - case NVPTX::StoreParamScalar4I32: case NVPTX::StoreParamScalar4I8: - case NVPTX::StoreParamV2F32: case NVPTX::StoreParamV2F64: - case NVPTX::StoreParamV2I16: case NVPTX::StoreParamV2I32: - case NVPTX::StoreParamV2I64: case NVPTX::StoreParamV2I8: - case NVPTX::StoreParamV4F32: case NVPTX::StoreParamV4I16: - case NVPTX::StoreParamV4I32: case NVPTX::StoreParamV4I8: case NVPTX::StoreRetvalF32: case NVPTX::StoreRetvalF64: case NVPTX::StoreRetvalI16: case NVPTX::StoreRetvalI32: case NVPTX::StoreRetvalI64: case NVPTX::StoreRetvalI8: - case NVPTX::StoreRetvalScalar2F32: case NVPTX::StoreRetvalScalar2F64: - case NVPTX::StoreRetvalScalar2I16: case NVPTX::StoreRetvalScalar2I32: - case NVPTX::StoreRetvalScalar2I64: case NVPTX::StoreRetvalScalar2I8: - case NVPTX::StoreRetvalScalar4F32: case NVPTX::StoreRetvalScalar4I16: - case NVPTX::StoreRetvalScalar4I32: case NVPTX::StoreRetvalScalar4I8: - case NVPTX::StoreRetvalV2F32: case NVPTX::StoreRetvalV2F64: - case NVPTX::StoreRetvalV2I16: case NVPTX::StoreRetvalV2I32: - case NVPTX::StoreRetvalV2I64: case NVPTX::StoreRetvalV2I8: - case NVPTX::StoreRetvalV4F32: case NVPTX::StoreRetvalV4I16: - 
case NVPTX::StoreRetvalV4I32: case NVPTX::StoreRetvalV4I8: case NVPTX::LastCallArgF32: case NVPTX::LastCallArgF64: case NVPTX::LastCallArgI16: case NVPTX::LastCallArgI32: case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64: @@ -2057,16 +2023,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) case NVPTX::LoadParamRegF32: case NVPTX::LoadParamRegF64: case NVPTX::LoadParamRegI16: case NVPTX::LoadParamRegI32: case NVPTX::LoadParamRegI64: case NVPTX::LoadParamRegI8: - case NVPTX::LoadParamScalar2F32: case NVPTX::LoadParamScalar2F64: - case NVPTX::LoadParamScalar2I16: case NVPTX::LoadParamScalar2I32: - case NVPTX::LoadParamScalar2I64: case NVPTX::LoadParamScalar2I8: - case NVPTX::LoadParamScalar4F32: case NVPTX::LoadParamScalar4I16: - case NVPTX::LoadParamScalar4I32: case NVPTX::LoadParamScalar4I8: - case NVPTX::LoadParamV2F32: case NVPTX::LoadParamV2F64: - case NVPTX::LoadParamV2I16: case NVPTX::LoadParamV2I32: - case NVPTX::LoadParamV2I64: case NVPTX::LoadParamV2I8: - case NVPTX::LoadParamV4F32: case NVPTX::LoadParamV4I16: - case NVPTX::LoadParamV4I32: case NVPTX::LoadParamV4I8: case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE: return true; } diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 36ab7f5c156..481f13afd18 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -105,6 +105,21 @@ SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::STORE: ResNode = SelectStore(N); break; + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + ResNode = SelectLoadVector(N); + break; + case NVPTXISD::LDGV2: + case NVPTXISD::LDGV4: + case NVPTXISD::LDUV2: + case NVPTXISD::LDUV4: + ResNode = SelectLDGLDUVector(N); + break; + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + ResNode = SelectStoreVector(N); + break; + default: break; } if (ResNode) return ResNode; @@ -214,16 +229,6 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { case MVT::i64: Opcode = NVPTX::LD_i64_avar; break; case MVT::f32: Opcode = NVPTX::LD_f32_avar; break; case MVT::f64: Opcode = NVPTX::LD_f64_avar; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_avar; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_avar; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break; default: return NULL; } SDValue Ops[] = { getI32Imm(isVolatile), @@ -244,16 +249,6 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { case MVT::i64: Opcode = NVPTX::LD_i64_asi; break; case MVT::f32: Opcode = NVPTX::LD_f32_asi; break; case MVT::f64: Opcode = NVPTX::LD_f64_asi; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_asi; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_asi; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break; default: return NULL; } SDValue 
Ops[] = { getI32Imm(isVolatile), @@ -267,24 +262,26 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { } else if (Subtarget.is64Bit()? SelectADDRri64(N1.getNode(), N1, Base, Offset): SelectADDRri(N1.getNode(), N1, Base, Offset)) { - switch (TargetVT) { - case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; - case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; - case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; - case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; - case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; - case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_ari; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_ari; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari_64; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari_64; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari_64; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari_64; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari_64; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari_64; break; + default: return NULL; + } + } else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; + default: return NULL; + } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -296,24 +293,26 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { MVT::Other, Ops, 8); } else { - switch (TargetVT) { - case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; - case MVT::i16: Opcode = NVPTX::LD_i16_areg; break; - case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; - case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; - case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; - case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_areg; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_areg; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg_64; break; + case MVT::i16: Opcode = NVPTX::LD_i16_areg_64; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg_64; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg_64; break; + case MVT::f32: Opcode = NVPTX::LD_f32_areg_64; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg_64; break; + default: return NULL; + } + } else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; + case MVT::i16: Opcode = 
NVPTX::LD_i16_areg; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; + case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; + default: return NULL; + } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -334,6 +333,370 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { return NVPTXLD; } +SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { + + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *LD; + MemSDNode *MemSD = cast(N); + EVT LoadedVT = MemSD->getMemoryVT(); + + + if (!LoadedVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool IsVolatile = MemSD->isVolatile(); + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + IsVolatile = false; + + // Vector Setting + MVT SimpleVT = LoadedVT.getSimpleVT(); + + // Type Setting: fromType + fromTypeWidth + // + // Sign : ISD::SEXTLOAD + // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the + // type is integer + // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned FromTypeWidth = ScalarVT.getSizeInBits(); + unsigned int FromType; + // The last operand holds the original LoadSDNode::getExtensionType() value + unsigned ExtensionType = + cast(N->getOperand(N->getNumOperands()-1))->getZExtValue(); + if (ExtensionType == ISD::SEXTLOAD) + FromType = NVPTX::PTXLdStInstCode::Signed; + else if (ScalarVT.isFloatingPoint()) + FromType = NVPTX::PTXLdStInstCode::Float; + else + FromType = NVPTX::PTXLdStInstCode::Unsigned; + + unsigned VecType; + + switch (N->getOpcode()) { + case NVPTXISD::LoadV2: VecType = NVPTX::PTXLdStInstCode::V2; break; + case NVPTXISD::LoadV4: VecType = NVPTX::PTXLdStInstCode::V4; break; + default: return NULL; + } + + EVT EltVT = N->getValueType(0); + + if (SelectDirectAddr(Op1, Addr)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_avar; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_avar; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_avar; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_avar; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_avar; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_avar; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_avar; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_avar; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_avar; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_avar; break; + } + break; + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Addr, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7); + } else if (Subtarget.is64Bit()? 
+ SelectADDRsi64(Op1.getNode(), Op1, Base, Offset): + SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_asi; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_asi; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_asi; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_asi; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_asi; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_asi; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_asi; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_asi; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_asi; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_asi; break; + } + break; + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Base, Offset, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8); + } else if (Subtarget.is64Bit()? + SelectADDRri64(Op1.getNode(), Op1, Base, Offset): + SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_ari_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_ari_64; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari_64; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_ari_64; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_ari_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_ari; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_ari; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_ari; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_ari; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari; break; + } + break; + } + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Base, Offset, Chain }; + + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8); + } else { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_areg_64; break; 
+ case MVT::i32: Opcode = NVPTX::LDV_i32_v2_areg_64; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg_64; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_areg_64; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_areg_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_areg; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_areg; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_areg; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_areg; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg; break; + } + break; + } + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Op1, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7); + } + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast(N)->getMemOperand(); + cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return LD; +} + +SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { + + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *LD; + + EVT RetVT = N->getValueType(0); + + // Select opcode + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LDGV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_64; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_64; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_64; break; + } + break; + case NVPTXISD::LDGV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_64; break; + } + break; + case NVPTXISD::LDUV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_64; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_64; break; + case MVT::f64: Opcode = 
NVPTX::INT_PTX_LDU_G_v2f64_ELE_64; break; + } + break; + case NVPTXISD::LDUV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LDGV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_32; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_32; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_32; break; + } + break; + case NVPTXISD::LDGV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_32; break; + } + break; + case NVPTXISD::LDUV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_32; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_32; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_32; break; + } + break; + case NVPTXISD::LDUV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_32; break; + } + break; + } + } + + SDValue Ops[] = { Op1, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), &Ops[0], 2); + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast(N)->getMemOperand(); + cast(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return LD; +} + + SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { DebugLoc dl = N->getDebugLoc(); StoreSDNode *ST = cast(N); @@ -400,16 +763,6 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { case MVT::i64: Opcode = NVPTX::ST_i64_avar; break; case MVT::f32: Opcode = NVPTX::ST_f32_avar; break; case MVT::f64: Opcode = NVPTX::ST_f64_avar; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_avar; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_avar; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break; default: return NULL; } SDValue Ops[] = { N1, @@ -431,16 +784,6 @@ SDNode* 
NVPTXDAGToDAGISel::SelectStore(SDNode *N) { case MVT::i64: Opcode = NVPTX::ST_i64_asi; break; case MVT::f32: Opcode = NVPTX::ST_f32_asi; break; case MVT::f64: Opcode = NVPTX::ST_f64_asi; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_asi; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_asi; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break; default: return NULL; } SDValue Ops[] = { N1, @@ -455,24 +798,26 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { } else if (Subtarget.is64Bit()? SelectADDRri64(N2.getNode(), N2, Base, Offset): SelectADDRri(N2.getNode(), N2, Base, Offset)) { - switch (SourceVT) { - case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; - case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; - case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; - case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; - case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; - case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_ari; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_ari; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari_64; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari_64; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari_64; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari_64; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari_64; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari_64; break; + default: return NULL; + } + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; + default: return NULL; + } } SDValue Ops[] = { N1, getI32Imm(isVolatile), @@ -484,24 +829,26 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 9); } else { - switch (SourceVT) { - case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; - case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; - case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; - case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; - case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; - case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_areg; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_areg; 
break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_areg; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg_64; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg_64; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg_64; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg_64; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg_64; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg_64; break; + default: return NULL; + } + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; + default: return NULL; + } } SDValue Ops[] = { N1, getI32Imm(isVolatile), @@ -523,6 +870,244 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { return NVPTXST; } +SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *ST; + EVT EltVT = Op1.getValueType(); + MemSDNode *MemSD = cast(N); + EVT StoreVT = MemSD->getMemoryVT(); + + // Address Space Setting + unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); + + if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { + report_fatal_error("Cannot store to pointer that points to constant " + "memory space"); + } + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool IsVolatile = MemSD->isVolatile(); + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + IsVolatile = false; + + // Type Setting: toType + toTypeWidth + // - for integer type, always use 'u' + assert(StoreVT.isSimple() && "Store value is not simple"); + MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); + unsigned ToTypeWidth = ScalarVT.getSizeInBits(); + unsigned ToType; + if (ScalarVT.isFloatingPoint()) + ToType = NVPTX::PTXLdStInstCode::Float; + else + ToType = NVPTX::PTXLdStInstCode::Unsigned; + + + SmallVector StOps; + SDValue N2; + unsigned VecType; + + switch (N->getOpcode()) { + case NVPTXISD::StoreV2: + VecType = NVPTX::PTXLdStInstCode::V2; + StOps.push_back(N->getOperand(1)); + StOps.push_back(N->getOperand(2)); + N2 = N->getOperand(3); + break; + case NVPTXISD::StoreV4: + VecType = NVPTX::PTXLdStInstCode::V4; + StOps.push_back(N->getOperand(1)); + StOps.push_back(N->getOperand(2)); + StOps.push_back(N->getOperand(3)); + StOps.push_back(N->getOperand(4)); + N2 = N->getOperand(5); + break; + default: return NULL; + } + + StOps.push_back(getI32Imm(IsVolatile)); + StOps.push_back(getI32Imm(CodeAddrSpace)); + StOps.push_back(getI32Imm(VecType)); + StOps.push_back(getI32Imm(ToType)); + StOps.push_back(getI32Imm(ToTypeWidth)); + + if (SelectDirectAddr(N2, Addr)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_avar; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_avar; break; + case MVT::i32: Opcode = 
NVPTX::STV_i32_v2_avar; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_avar; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_avar; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_avar; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_avar; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_avar; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_avar; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_avar; break; + } + break; + } + StOps.push_back(Addr); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N2.getNode(), N2, Base, Offset): + SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_asi; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_asi; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_asi; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_asi; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_asi; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_asi; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_asi; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_asi; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_asi; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_asi; break; + } + break; + } + StOps.push_back(Base); + StOps.push_back(Offset); + } else if (Subtarget.is64Bit()? + SelectADDRri64(N2.getNode(), N2, Base, Offset): + SelectADDRri(N2.getNode(), N2, Base, Offset)) { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_ari_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_ari_64; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari_64; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_ari_64; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_ari_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_ari_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_ari; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_ari; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_ari; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_ari; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_ari; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari; break; + } + break; + } + } + StOps.push_back(Base); + StOps.push_back(Offset); + } else { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { 
+ default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_areg_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_areg_64; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg_64; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_areg_64; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_areg_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_areg; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_areg; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_areg; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_areg; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg; break; + } + break; + } + } + StOps.push_back(N2); + } + + StOps.push_back(Chain); + + ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, &StOps[0], StOps.size()); + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast(N)->getMemOperand(); + cast(ST)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return ST; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. 
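
Illustrative aside, not part of the patch: the new SelectLoadVector/SelectLDGLDUVector/SelectStoreVector routines above all index into the same kind of opcode table — {element type} x {v2, v4} x {addressing mode}, with dedicated _64 variants only for the register-based ari/areg forms. A dependency-free sketch of that naming scheme follows; pickOpcode and suffix are invented helpers, and the real selectors additionally reject combinations PTX has no instruction for (for example, v4 of 64-bit elements), which this sketch does not.

#include <cstdio>
#include <string>

enum class AddrMode { Avar, Asi, Ari, Areg };

// Only the register-based addressing forms get distinct 64-bit variants.
static const char *suffix(AddrMode M, bool Is64Bit) {
  switch (M) {
  case AddrMode::Avar: return "_avar";
  case AddrMode::Asi:  return "_asi";
  case AddrMode::Ari:  return Is64Bit ? "_ari_64" : "_ari";
  case AddrMode::Areg: return Is64Bit ? "_areg_64" : "_areg";
  }
  return "";
}

// e.g. pickOpcode("i32", 4, AddrMode::Ari, true) -> "LDV_i32_v4_ari_64"
static std::string pickOpcode(const std::string &Elt, unsigned NumElts,
                              AddrMode M, bool Is64Bit) {
  return "LDV_" + Elt + "_v" + std::to_string(NumElts) + suffix(M, Is64Bit);
}

int main() {
  std::printf("%s\n", pickOpcode("f32", 2, AddrMode::Avar, false).c_str());
  std::printf("%s\n", pickOpcode("i32", 4, AddrMode::Ari, true).c_str());
  return 0;
}
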
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 14f2091a3f0..4ec924117a0 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -72,8 +72,11 @@ private: #include "NVPTXGenDAGISel.inc" SDNode *Select(SDNode *N); - SDNode* SelectLoad(SDNode *N); - SDNode* SelectStore(SDNode *N); + SDNode *SelectLoad(SDNode *N); + SDNode *SelectLoadVector(SDNode *N); + SDNode *SelectLDGLDUVector(SDNode *N); + SDNode *SelectStore(SDNode *N); + SDNode *SelectStoreVector(SDNode *N); inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 2699cea8f66..9ba2a1d984d 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -45,15 +45,27 @@ using namespace llvm; static unsigned int uniqueCallSite = 0; static cl::opt -RetainVectorOperands("nvptx-codegen-vectors", - cl::desc("NVPTX Specific: Retain LLVM's vectors and generate PTX vectors"), - cl::init(true)); - -static cl::opt sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); +static bool IsPTXVectorType(MVT VT) { + switch (VT.SimpleTy) { + default: return false; + case MVT::v2i8: + case MVT::v4i8: + case MVT::v2i16: + case MVT::v4i16: + case MVT::v2i32: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v4f32: + case MVT::v2f64: + return true; + } +} + // NVPTXTargetLowering Constructor. NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) : TargetLowering(TM, new NVPTXTargetObjectFile()), @@ -87,41 +99,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); - if (RetainVectorOperands) { - addRegisterClass(MVT::v2f32, &NVPTX::V2F32RegsRegClass); - addRegisterClass(MVT::v4f32, &NVPTX::V4F32RegsRegClass); - addRegisterClass(MVT::v2i32, &NVPTX::V2I32RegsRegClass); - addRegisterClass(MVT::v4i32, &NVPTX::V4I32RegsRegClass); - addRegisterClass(MVT::v2f64, &NVPTX::V2F64RegsRegClass); - addRegisterClass(MVT::v2i64, &NVPTX::V2I64RegsRegClass); - addRegisterClass(MVT::v2i16, &NVPTX::V2I16RegsRegClass); - addRegisterClass(MVT::v4i16, &NVPTX::V4I16RegsRegClass); - addRegisterClass(MVT::v2i8, &NVPTX::V2I8RegsRegClass); - addRegisterClass(MVT::v4i8, &NVPTX::V4I8RegsRegClass); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8 , Custom); - - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64 , Custom); - 
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i8 , Custom); - } - // Operations not directly supported by NVPTX. setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::Other, Expand); @@ -191,42 +168,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) // TRAP can be lowered to PTX trap setOperationAction(ISD::TRAP, MVT::Other, Legal); - // By default, CONCAT_VECTORS is implemented via store/load - // through stack. It is slow and uses local memory. We need - // to custom-lowering them. - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i16 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i8 , Custom); - - // Expand vector int to float and float to int conversions - // - For SINT_TO_FP and UINT_TO_FP, the src type - // (Node->getOperand(0).getValueType()) - // is used to determine the action, while for FP_TO_UINT and FP_TO_SINT, - // the dest type (Node->getValueType(0)) is used. - // - // See VectorLegalizer::LegalizeOp() (LegalizeVectorOps.cpp) for the vector - // case, and - // SelectionDAGLegalize::LegalizeOp() (LegalizeDAG.cpp) for the scalar case. - // - // That is why v4i32 or v2i32 are used here. - // - // The expansion for vectors happens in VectorLegalizer::LegalizeOp() - // (LegalizeVectorOps.cpp). 
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + // Register custom handling for vector loads/stores + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + if (IsPTXVectorType(VT)) { + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); + } + } // Now deduce the information based on the above mentioned // actions @@ -268,6 +219,14 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::RETURN: return "NVPTXISD::RETURN"; case NVPTXISD::CallSeqBegin: return "NVPTXISD::CallSeqBegin"; case NVPTXISD::CallSeqEnd: return "NVPTXISD::CallSeqEnd"; + case NVPTXISD::LoadV2: return "NVPTXISD::LoadV2"; + case NVPTXISD::LoadV4: return "NVPTXISD::LoadV4"; + case NVPTXISD::LDGV2: return "NVPTXISD::LDGV2"; + case NVPTXISD::LDGV4: return "NVPTXISD::LDGV4"; + case NVPTXISD::LDUV2: return "NVPTXISD::LDUV2"; + case NVPTXISD::LDUV4: return "NVPTXISD::LDUV4"; + case NVPTXISD::StoreV2: return "NVPTXISD::StoreV2"; + case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; } } @@ -868,12 +827,19 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { } +SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::i1) + return LowerLOADi1(Op, DAG); + else + return SDValue(); +} + // v = ld i1* addr // => // v1 = ld i8* addr // v = trunc v1 to i1 SDValue NVPTXTargetLowering:: -LowerLOAD(SDValue Op, SelectionDAG &DAG) const { +LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); LoadSDNode *LD = cast(Node); DebugLoc dl = Node->getDebugLoc(); @@ -893,12 +859,109 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, 2, dl); } +SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + EVT ValVT = Op.getOperand(1).getValueType(); + if (ValVT == MVT::i1) + return LowerSTOREi1(Op, DAG); + else if (ValVT.isVector()) + return LowerSTOREVector(Op, DAG); + else + return SDValue(); +} + +SDValue +NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { + SDNode *N = Op.getNode(); + SDValue Val = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + EVT ValVT = Val.getValueType(); + + if (ValVT.isVector()) { + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 stores of <2 x double> here + // but I'm leaving that as a TODO for now. + if (!ValVT.isSimple()) + return SDValue(); + switch (ValVT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f32: + // This is a "native" vector type + break; + } + + unsigned Opcode = 0; + EVT EltVT = ValVT.getVectorElementType(); + unsigned NumElts = ValVT.getVectorNumElements(); + + // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // stored type to i16 and propogate the "real" type as the memory type. + bool NeedExt = false; + if (EltVT.getSizeInBits() < 16) + NeedExt = true; + + switch (NumElts) { + default: return SDValue(); + case 2: + Opcode = NVPTXISD::StoreV2; + break; + case 4: { + Opcode = NVPTXISD::StoreV4; + break; + } + } + + SmallVector Ops; + + // First is the chain + Ops.push_back(N->getOperand(0)); + + // Then the split values + for (unsigned i = 0; i < NumElts; ++i) { + SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(i)); + if (NeedExt) + // ANY_EXTEND is correct here since the store will only look at the + // lower-order bits anyway. + ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + Ops.push_back(ExtVal); + } + + // Then any remaining arguments + for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + + MemSDNode *MemSD = cast(N); + + SDValue NewSt = DAG.getMemIntrinsicNode(Opcode, DL, + DAG.getVTList(MVT::Other), &Ops[0], + Ops.size(), MemSD->getMemoryVT(), + MemSD->getMemOperand()); + + + //return DCI.CombineTo(N, NewSt, true); + return NewSt; + } + + return SDValue(); +} + // st i1 v, addr // => // v1 = zxt v to i8 // st i8, addr SDValue NVPTXTargetLowering:: -LowerSTORE(SDValue Op, SelectionDAG &DAG) const { +LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); StoreSDNode *ST = cast(Node); @@ -1348,3 +1411,242 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } + +/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. +static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl& Results) { + EVT ResVT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + assert(ResVT.isVector() && "Vector load must have vector type"); + + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 loads of <2 x double> here + // but I'm leaving that as a TODO for now. + assert(ResVT.isSimple() && "Can only handle simple types"); + switch (ResVT.getSimpleVT().SimpleTy) { + default: return; + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f32: + // This is a "native" vector type + break; + } + + EVT EltVT = ResVT.getVectorElementType(); + unsigned NumElts = ResVT.getVectorNumElements(); + + // Since LoadV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propogate the "real" type as the memory type. 
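+ // The widened results are truncated back to the original element type below (see NeedTrunc) before being packed into the final BUILD_VECTOR.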
+ bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + + switch (NumElts) { + default: return; + case 2: + Opcode = NVPTXISD::LoadV2; + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + Opcode = NVPTXISD::LoadV4; + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs, 5); + break; + } + } + + SmallVector OtherOps; + + // Copy regular operands + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + OtherOps.push_back(N->getOperand(i)); + + LoadSDNode *LD = cast(N); + + // The select routine does not have access to the LoadSDNode instance, so + // pass along the extension information + OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], + OtherOps.size(), LD->getMemoryVT(), + LD->getMemOperand()); + + SmallVector ScalarRes; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); +} + +static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, + SelectionDAG &DAG, + SmallVectorImpl &Results) { + SDValue Chain = N->getOperand(0); + SDValue Intrin = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Get the intrinsic ID + unsigned IntrinNo = cast(Intrin.getNode())->getZExtValue(); + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: { + EVT ResVT = N->getValueType(0); + + if (ResVT.isVector()) { + // Vector LDG/LDU + + unsigned NumElts = ResVT.getVectorNumElements(); + EVT EltVT = ResVT.getVectorElementType(); + + // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propogate the "real" type as the memory type. 
+ bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + + switch (NumElts) { + default: return; + case 2: + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV2; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV2; + break; + } + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV4; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV4; + break; + } + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs, 5); + break; + } + } + + SmallVector OtherOps; + + // Copy regular operands + + OtherOps.push_back(Chain); // Chain + // Skip operand 1 (intrinsic ID) + // Others + for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) + OtherOps.push_back(N->getOperand(i)); + + MemIntrinsicSDNode *MemSD = cast(N); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], + OtherOps.size(), MemSD->getMemoryVT(), + MemSD->getMemOperand()); + + SmallVector ScalarRes; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); + } else { + // i8 LDG/LDU + assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && + "Custom handling of non-i8 ldu/ldg?"); + + // Just copy all operands as-is + SmallVector Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + Ops.push_back(N->getOperand(i)); + + // Force output to i16 + SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); + + MemIntrinsicSDNode *MemSD = cast(N); + + // We make sure the memory type is i8, which will be used during isel + // to select the proper instruction. 
+ SDValue NewLD = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, + LdResVTs, &Ops[0], + Ops.size(), MVT::i8, + MemSD->getMemOperand()); + + Results.push_back(NewLD.getValue(0)); + Results.push_back(NewLD.getValue(1)); + } + } + } +} + +void NVPTXTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: report_fatal_error("Unhandled custom legalization"); + case ISD::LOAD: + ReplaceLoadVector(N, DAG, Results); + return; + case ISD::INTRINSIC_W_CHAIN: + ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); + return; + } +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 0a1833a7c9d..95e7b55c48d 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -58,7 +58,16 @@ enum NodeType { RETURN, CallSeqBegin, CallSeqEnd, - Dummy + Dummy, + + LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, + LoadV4, + LDGV2, // LDG.v2 + LDGV4, // LDG.v4 + LDUV2, // LDU.v2 + LDUV4, // LDU.v4 + StoreV2, + StoreV4 }; } @@ -143,8 +152,16 @@ private: SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; + + virtual void ReplaceNodeResults(SDNode *N, + SmallVectorImpl &Results, + SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6fe654cb49d..9e73d80c285 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -65,46 +65,6 @@ void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB, NVPTX::Float64RegsRegClass.contains(SrcReg)) BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4F32RegsRegClass.contains(DestReg) && - NVPTX::V4F32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I32RegsRegClass.contains(DestReg) && - NVPTX::V4I32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2F32RegsRegClass.contains(DestReg) && - NVPTX::V2F32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I32RegsRegClass.contains(DestReg) && - NVPTX::V2I32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I8RegsRegClass.contains(DestReg) && - NVPTX::V4I8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I8RegsRegClass.contains(DestReg) && - NVPTX::V2I8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I16RegsRegClass.contains(DestReg) && - NVPTX::V4I16RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I16RegsRegClass.contains(DestReg) && - 
NVPTX::V2I16RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I64RegsRegClass.contains(DestReg) && - NVPTX::V2I64RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2F64RegsRegClass.contains(DestReg) && - NVPTX::V2F64RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); else { llvm_unreachable("Don't know how to copy a register"); } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 8a410b87292..f43abe283b5 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -52,6 +52,7 @@ def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">; def hasVote : Predicate<"Subtarget.hasVote()">; def hasDouble : Predicate<"Subtarget.hasDouble()">; def reqPTX20 : Predicate<"Subtarget.reqPTX20()">; +def hasLDG : Predicate<"Subtarget.hasLDG()">; def hasLDU : Predicate<"Subtarget.hasLDU()">; def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">; @@ -2153,11 +2154,21 @@ multiclass LD { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t$dst, [$addr];"), []>; + def _areg_64 : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", + " \t$dst, [$addr];"), []>; def _ari : NVPTXInst<(outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t$dst, [$addr+$offset];"), []>; + def _ari_64 : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", + " \t$dst, [$addr+$offset];"), []>; def _asi : NVPTXInst<(outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), @@ -2174,19 +2185,6 @@ defm LD_f32 : LD; defm LD_f64 : LD; } -let VecInstType=isVecLD.Value, mayLoad=1, neverHasSideEffects=1 in { -defm LD_v2i8 : LD; -defm LD_v4i8 : LD; -defm LD_v2i16 : LD; -defm LD_v4i16 : LD; -defm LD_v2i32 : LD; -defm LD_v4i32 : LD; -defm LD_v2f32 : LD; -defm LD_v4f32 : LD; -defm LD_v2i64 : LD; -defm LD_v2f64 : LD; -} - multiclass ST { def _avar : NVPTXInst<(outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, @@ -2198,11 +2196,21 @@ multiclass ST { LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", " \t[$addr], $src;"), []>; + def _areg_64 : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", + "\t[$addr], $src;"), []>; def _ari : NVPTXInst<(outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), 
!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", " \t[$addr+$offset], $src;"), []>; + def _ari_64 : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", + "\t[$addr+$offset], $src;"), []>; def _asi : NVPTXInst<(outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), @@ -2219,19 +2227,6 @@ defm ST_f32 : ST; defm ST_f64 : ST; } -let VecInstType=isVecST.Value, mayStore=1, neverHasSideEffects=1 in { -defm ST_v2i8 : ST; -defm ST_v4i8 : ST; -defm ST_v2i16 : ST; -defm ST_v4i16 : ST; -defm ST_v2i32 : ST; -defm ST_v4i32 : ST; -defm ST_v2f32 : ST; -defm ST_v4f32 : ST; -defm ST_v2i64 : ST; -defm ST_v2f64 : ST; -} - // The following is used only in and after vector elementizations. // Vector elementization happens at the machine instruction level, so the // following instruction @@ -2247,11 +2242,21 @@ multiclass LD_VEC { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), @@ -2269,6 +2274,12 @@ multiclass LD_VEC { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2276,6 +2287,13 @@ multiclass LD_VEC { !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), []>; + def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + 
!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2304,12 +2322,23 @@ multiclass ST_VEC { LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_areg_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; def _v2_ari : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v2_ari_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; def _v2_asi : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, @@ -2328,6 +2357,12 @@ multiclass ST_VEC { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_areg_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; def _v4_ari : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2335,6 +2370,13 @@ multiclass ST_VEC { !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_ari_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; def _v4_asi : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2822,8 +2864,6 @@ def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; -include "NVPTXVector.td" - include "NVPTXIntrinsics.td" diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 028a94bfd1b..49e2568dfa2 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1343,52 +1343,113 @@ defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"v4.f32 
\t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; -// Vector ldu -multiclass VLDU_G { - def _32: NVPTXVecInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))], eleInst>, - Requires<[hasLDU]>; - def _64: NVPTXVecInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))], eleInst64>, - Requires<[hasLDU]>; + +//----------------------------------- +// Support for ldg on sm_35 or later +//----------------------------------- + +def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ + MemIntrinsicSDNode *M = cast(N); + return M->getMemoryVT() == MVT::i8; +}]>; + +multiclass LDG_G { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDG]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; +} + +multiclass LDG_G_NOINTRIN { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDG]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; +} + +defm INT_PTX_LDG_GLOBAL_i8 + : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; +defm INT_PTX_LDG_GLOBAL_i16 + : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_i32 + : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_i64 + : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_f32 + : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; +defm INT_PTX_LDG_GLOBAL_f64 + : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; +defm INT_PTX_LDG_GLOBAL_p32 + : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; +defm INT_PTX_LDG_GLOBAL_p64 + : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + +// vector + +// Elementized vector ldg +multiclass VLDG_G_ELE_V2 { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins 
Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; } -let VecInstType=isVecLD.Value in { -defm INT_PTX_LDU_G_v2i8 : VLDU_G<"v2.u8 \t${result:vecfull}, [$src];", - V2I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i8_ELE_32, - INT_PTX_LDU_G_v2i8_ELE_64>; -defm INT_PTX_LDU_G_v4i8 : VLDU_G<"v4.u8 \t${result:vecfull}, [$src];", - V4I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i8_ELE_32, - INT_PTX_LDU_G_v4i8_ELE_64>; -defm INT_PTX_LDU_G_v2i16 : VLDU_G<"v2.u16 \t${result:vecfull}, [$src];", - V2I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i16_ELE_32, - INT_PTX_LDU_G_v2i16_ELE_64>; -defm INT_PTX_LDU_G_v4i16 : VLDU_G<"v4.u16 \t${result:vecfull}, [$src];", - V4I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i16_ELE_32, - INT_PTX_LDU_G_v4i16_ELE_64>; -defm INT_PTX_LDU_G_v2i32 : VLDU_G<"v2.u32 \t${result:vecfull}, [$src];", - V2I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i32_ELE_32, - INT_PTX_LDU_G_v2i32_ELE_64>; -defm INT_PTX_LDU_G_v4i32 : VLDU_G<"v4.u32 \t${result:vecfull}, [$src];", - V4I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i32_ELE_32, - INT_PTX_LDU_G_v4i32_ELE_64>; -defm INT_PTX_LDU_G_v2f32 : VLDU_G<"v2.f32 \t${result:vecfull}, [$src];", - V2F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f32_ELE_32, - INT_PTX_LDU_G_v2f32_ELE_64>; -defm INT_PTX_LDU_G_v4f32 : VLDU_G<"v4.f32 \t${result:vecfull}, [$src];", - V4F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v4f32_ELE_32, - INT_PTX_LDU_G_v4f32_ELE_64>; -defm INT_PTX_LDU_G_v2i64 : VLDU_G<"v2.u64 \t${result:vecfull}, [$src];", - V2I64Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i64_ELE_32, - INT_PTX_LDU_G_v2i64_ELE_64>; -defm INT_PTX_LDU_G_v2f64 : VLDU_G<"v2.f64 \t${result:vecfull}, [$src];", - V2F64Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f64_ELE_32, - INT_PTX_LDU_G_v2f64_ELE_64>; +multiclass VLDG_G_ELE_V4 { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; } +// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
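+// Note: the *_ELE multiclasses above deliberately carry no selection patterns;
+// the NVPTXISD::LDGV2/LDGV4 (and LDUV2/LDUV4) nodes built during type
+// legalization are matched to these instructions explicitly in
+// NVPTXISelDAGToDAG rather than through TableGen patterns.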
+defm INT_PTX_LDG_G_v2i8_ELE + : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v2i16_ELE + : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v2i32_ELE + : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v2f32_ELE + : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>; +defm INT_PTX_LDG_G_v2i64_ELE + : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>; +defm INT_PTX_LDG_G_v2f64_ELE + : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>; +defm INT_PTX_LDG_G_v4i8_ELE + : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v4i16_ELE + : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v4i32_ELE + : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v4f32_ELE + : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; multiclass NG_TO_G { diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index e0c9161f31f..8e105b50289 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -54,36 +54,6 @@ std::string getNVPTXRegClassName (TargetRegisterClass const *RC) { else if (RC == &NVPTX::SpecialRegsRegClass) { return "!Special!"; } - else if (RC == &NVPTX::V2F32RegsRegClass) { - return ".v2.f32"; - } - else if (RC == &NVPTX::V4F32RegsRegClass) { - return ".v4.f32"; - } - else if (RC == &NVPTX::V2I32RegsRegClass) { - return ".v2.s32"; - } - else if (RC == &NVPTX::V4I32RegsRegClass) { - return ".v4.s32"; - } - else if (RC == &NVPTX::V2F64RegsRegClass) { - return ".v2.f64"; - } - else if (RC == &NVPTX::V2I64RegsRegClass) { - return ".v2.s64"; - } - else if (RC == &NVPTX::V2I16RegsRegClass) { - return ".v2.s16"; - } - else if (RC == &NVPTX::V4I16RegsRegClass) { - return ".v4.s16"; - } - else if (RC == &NVPTX::V2I8RegsRegClass) { - return ".v2.s16"; - } - else if (RC == &NVPTX::V4I8RegsRegClass) { - return ".v4.s16"; - } else { return "INTERNAL"; } @@ -115,137 +85,11 @@ std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) { else if (RC == &NVPTX::SpecialRegsRegClass) { return "!Special!"; } - else if (RC == &NVPTX::V2F32RegsRegClass) { - return "%v2f"; - } - else if (RC == &NVPTX::V4F32RegsRegClass) { - return "%v4f"; - } - else if (RC == &NVPTX::V2I32RegsRegClass) { - return "%v2r"; - } - else if (RC == &NVPTX::V4I32RegsRegClass) { - return "%v4r"; - } - else if (RC == &NVPTX::V2F64RegsRegClass) { - return "%v2fd"; - } - else if (RC == &NVPTX::V2I64RegsRegClass) { - return "%v2rd"; - } - else if (RC == &NVPTX::V2I16RegsRegClass) { - return "%v2s"; - } - else if (RC == &NVPTX::V4I16RegsRegClass) { - return "%v4rs"; - } - else if (RC == &NVPTX::V2I8RegsRegClass) { - return "%v2rc"; - } - else if (RC == &NVPTX::V4I8RegsRegClass) { - return "%v4rc"; - } else { return "INTERNAL"; } return ""; } - -bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return true; - if 
(RC->getID() == NVPTX::V4I16RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return true; - return false; -} - -std::string getNVPTXElemClassName(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass); - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass); - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); - llvm_unreachable("Not a vector register class"); -} - -const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return (&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return (&NVPTX::Float64RegsRegClass); - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return (&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return (&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return (&NVPTX::Int64RegsRegClass); - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return (&NVPTX::Int8RegsRegClass); - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return (&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return (&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return (&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return (&NVPTX::Int8RegsRegClass); - llvm_unreachable("Not a vector register class"); -} - -int getNVPTXVectorSize(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return 4; - llvm_unreachable("Not a vector register class"); -} } NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii, diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h index a3e1252b494..56e6289b1bb 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -81,10 +81,6 @@ public: std::string getNVPTXRegClassName (const TargetRegisterClass *RC); std::string getNVPTXRegClassStr (const 
TargetRegisterClass *RC); -bool isNVPTXVectorRegClass (const TargetRegisterClass *RC); -std::string getNVPTXElemClassName (const TargetRegisterClass *RC); -int getNVPTXVectorSize (const TargetRegisterClass *RC); -const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC); } // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index ba158258b99..8d100d63168 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -37,9 +37,6 @@ foreach i = 0-395 in { def RL#i : NVPTXReg<"%rl"#i>; // 64-bit def F#i : NVPTXReg<"%f"#i>; // 32-bit float def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float - // Vectors - foreach s = [ "2b8", "2b16", "2b32", "2b64", "4b8", "4b16", "4b32" ] in - def v#s#_#i : NVPTXReg<"%v"#s#"_"#i>; // Arguments def ia#i : NVPTXReg<"%ia"#i>; @@ -65,44 +62,3 @@ def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; - -class NVPTXVecRegClass regTypes, int alignment, dag regList, - NVPTXRegClass sClass, - int e, - string n> - : NVPTXRegClass -{ - NVPTXRegClass scalarClass=sClass; - int elems=e; - string name=n; -} -def V2F32Regs - : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%u", 0, 395)), - Float32Regs, 2, ".v2.f32">; -def V4F32Regs - : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%u", 0, 395)), - Float32Regs, 4, ".v4.f32">; -def V2I32Regs - : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%u", 0, 395)), - Int32Regs, 2, ".v2.u32">; -def V4I32Regs - : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%u", 0, 395)), - Int32Regs, 4, ".v4.u32">; -def V2F64Regs - : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%u", 0, 395)), - Float64Regs, 2, ".v2.f64">; -def V2I64Regs - : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%u", 0, 395)), - Int64Regs, 2, ".v2.u64">; -def V2I16Regs - : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%u", 0, 395)), - Int16Regs, 2, ".v2.u16">; -def V4I16Regs - : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%u", 0, 395)), - Int16Regs, 4, ".v4.u16">; -def V2I8Regs - : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%u", 0, 395)), - Int8Regs, 2, ".v2.u8">; -def V4I8Regs - : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%u", 0, 395)), - Int8Regs, 4, ".v4.u8">; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index e6cb7c2def6..beea77e38d8 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -57,6 +57,7 @@ public: bool hasF32FTZ() const { return SmVersion >= 20; } bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } + bool hasLDG() const { return SmVersion >= 32; } bool hasLDU() const { return SmVersion >= 20; } bool hasGenericLdSt() const { return SmVersion >= 20; } inline bool hasHWROT32() const { return false; } diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b4e049ea3e5..cd765fa8cb1 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -123,7 +123,6 @@ bool NVPTXPassConfig::addInstSelector() { addPass(createSplitBBatBarPass()); addPass(createAllocaHoisting()); addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); - addPass(createVectorElementizePass(getNVPTXTargetMachine())); return false; } diff --git 
a/lib/Target/NVPTX/VectorElementize.cpp b/lib/Target/NVPTX/VectorElementize.cpp deleted file mode 100644 index f1b285dd787..00000000000 --- a/lib/Target/NVPTX/VectorElementize.cpp +++ /dev/null @@ -1,1239 +0,0 @@ -//===-- VectorElementize.cpp - Remove unreachable blocks for codegen --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass converts operations on vector types to operations on their -// element types. -// -// For generic binary and unary vector instructions, the conversion is simple. -// Suppose we have -// av = bv Vop cv -// where av, bv, and cv are vector virtual registers, and Vop is a vector op. -// This gets converted to the following : -// a1 = b1 Sop c1 -// a2 = b2 Sop c2 -// -// VectorToScalarMap maintains the vector vreg to scalar vreg mapping. -// For the above example, the map will look as follows: -// av => [a1, a2] -// bv => [b1, b2] -// -// In addition, initVectorInfo creates the following opcode->opcode map. -// Vop => Sop -// OtherVop => OtherSop -// ... -// -// For vector specific instructions like vecbuild, vecshuffle etc, the -// conversion is different. Look at comments near the functions with -// prefix createVec<...>. -// -//===----------------------------------------------------------------------===// - -#include "NVPTX.h" -#include "NVPTXTargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Target/TargetInstrInfo.h" - -using namespace llvm; - -namespace { - -class LLVM_LIBRARY_VISIBILITY VectorElementize : public MachineFunctionPass { - virtual bool runOnMachineFunction(MachineFunction &F); - - NVPTXTargetMachine &TM; - MachineRegisterInfo *MRI; - const NVPTXRegisterInfo *RegInfo; - const NVPTXInstrInfo *InstrInfo; - - llvm::DenseMap - RegClassMap; - llvm::DenseMap SimpleMoveMap; - - llvm::DenseMap > VectorToScalarMap; - - bool isVectorInstr(MachineInstr *); - - SmallVector getScalarRegisters(unsigned); - unsigned getScalarVersion(unsigned); - unsigned getScalarVersion(MachineInstr *); - - bool isVectorRegister(unsigned); - const TargetRegisterClass *getScalarRegClass(const TargetRegisterClass *RC); - unsigned numCopiesNeeded(MachineInstr *); - - void createLoadCopy(MachineFunction&, MachineInstr *, - std::vector&); - void createStoreCopy(MachineFunction&, MachineInstr *, - std::vector&); - - void createVecDest(MachineFunction&, MachineInstr *, - std::vector&); - - void createCopies(MachineFunction&, MachineInstr *, - std::vector&); - - unsigned copyProp(MachineFunction&); - unsigned removeDeadMoves(MachineFunction&); - - void elementize(MachineFunction&); - - bool isSimpleMove(MachineInstr *); - - void createVecShuffle(MachineFunction& F, MachineInstr *Instr, - std::vector& copies); - - void createVecExtract(MachineFunction& F, MachineInstr *Instr, - std::vector& copies); - - void createVecInsert(MachineFunction& F, 
MachineInstr *Instr, - std::vector& copies); - - void createVecBuild(MachineFunction& F, MachineInstr *Instr, - std::vector& copies); - -public: - - static char ID; // Pass identification, replacement for typeid - VectorElementize(NVPTXTargetMachine &tm) - : MachineFunctionPass(ID), TM(tm) {} - - virtual const char *getPassName() const { - return "Convert LLVM vector types to their element types"; - } -}; - -char VectorElementize::ID = 1; -} - -static cl::opt -RemoveRedundantMoves("nvptx-remove-redundant-moves", - cl::desc("NVPTX: Remove redundant moves introduced by vector lowering"), - cl::init(true)); - -#define VECINST(x) ((((x)->getDesc().TSFlags) & NVPTX::VecInstTypeMask) \ - >> NVPTX::VecInstTypeShift) -#define ISVECINST(x) (VECINST(x) != NVPTX::VecNOP) -#define ISVECLOAD(x) (VECINST(x) == NVPTX::VecLoad) -#define ISVECSTORE(x) (VECINST(x) == NVPTX::VecStore) -#define ISVECBUILD(x) (VECINST(x) == NVPTX::VecBuild) -#define ISVECSHUFFLE(x) (VECINST(x) == NVPTX::VecShuffle) -#define ISVECEXTRACT(x) (VECINST(x) == NVPTX::VecExtract) -#define ISVECINSERT(x) (VECINST(x) == NVPTX::VecInsert) -#define ISVECDEST(x) (VECINST(x) == NVPTX::VecDest) - -bool VectorElementize::isSimpleMove(MachineInstr *mi) { - if (mi->isCopy()) - return true; - unsigned TSFlags = (mi->getDesc().TSFlags & NVPTX::SimpleMoveMask) - >> NVPTX::SimpleMoveShift; - return (TSFlags == 1); -} - -bool VectorElementize::isVectorInstr(MachineInstr *mi) { - if ((mi->getOpcode() == NVPTX::PHI) || - (mi->getOpcode() == NVPTX::IMPLICIT_DEF) || mi->isCopy()) { - MachineOperand dest = mi->getOperand(0); - return isVectorRegister(dest.getReg()); - } - return ISVECINST(mi); -} - -unsigned VectorElementize::getScalarVersion(MachineInstr *mi) { - return getScalarVersion(mi->getOpcode()); -} - -///============================================================================= -///Instr is assumed to be a vector instruction. For most vector instructions, -///the size of the destination vector register gives the number of scalar copies -///needed. For VecStore, size of getOperand(1) gives the number of scalar copies -///needed. For VecExtract, the dest is a scalar. So getOperand(1) gives the -///number of scalar copies needed. 
-///============================================================================= -unsigned VectorElementize::numCopiesNeeded(MachineInstr *Instr) { - unsigned numDefs=0; - unsigned def; - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - - if (!oper.isReg()) continue; - if (!oper.isDef()) continue; - def = i; - numDefs++; - } - assert((numDefs <= 1) && "Only 0 or 1 defs supported"); - - if (numDefs == 1) { - unsigned regnum = Instr->getOperand(def).getReg(); - if (ISVECEXTRACT(Instr)) - regnum = Instr->getOperand(1).getReg(); - return getNVPTXVectorSize(MRI->getRegClass(regnum)); - } - else if (numDefs == 0) { - assert(ISVECSTORE(Instr) - && "Only 0 def instruction supported is vector store"); - - unsigned regnum = Instr->getOperand(0).getReg(); - return getNVPTXVectorSize(MRI->getRegClass(regnum)); - } - return 1; -} - -const TargetRegisterClass *VectorElementize:: -getScalarRegClass(const TargetRegisterClass *RC) { - assert(isNVPTXVectorRegClass(RC) && - "Not a vector register class"); - return getNVPTXElemClass(RC); -} - -bool VectorElementize::isVectorRegister(unsigned reg) { - const TargetRegisterClass *RC=MRI->getRegClass(reg); - return isNVPTXVectorRegClass(RC); -} - -///============================================================================= -///For every vector register 'v' that is not already in the VectorToScalarMap, -///create n scalar registers of the corresponding element type, where n -///is 2 or 4 (getNVPTXVectorSize) and add it VectorToScalarMap. -///============================================================================= -SmallVector VectorElementize::getScalarRegisters(unsigned regnum) { - assert(isVectorRegister(regnum) && "Expecting a vector register here"); - // Create the scalar registers and put them in the map, if not already there. - if (VectorToScalarMap.find(regnum) == VectorToScalarMap.end()) { - const TargetRegisterClass *vecClass = MRI->getRegClass(regnum); - const TargetRegisterClass *scalarClass = getScalarRegClass(vecClass); - - SmallVector temp; - - for (unsigned i=0, e=getNVPTXVectorSize(vecClass); i!=e; ++i) - temp.push_back(MRI->createVirtualRegister(scalarClass)); - - VectorToScalarMap[regnum] = temp; - } - return VectorToScalarMap[regnum]; -} - -///============================================================================= -///For a vector load of the form -///va <= ldv2 [addr] -///the following multi output instruction is created : -///[v1, v2] <= LD [addr] -///Look at NVPTXVector.td for the definitions of multi output loads. -///============================================================================= -void VectorElementize::createLoadCopy(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - // Remove the dest, that should be a vector operand. 
- MachineOperand dest = copy->getOperand(0); - unsigned regnum = dest.getReg(); - - SmallVector scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i], RegState::Define); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); - -} - -///============================================================================= -///For a vector store of the form -///stv2 va, [addr] -///the following multi input instruction is created : -///ST v1, v2, [addr] -///Look at NVPTXVector.td for the definitions of multi input stores. -///============================================================================= -void VectorElementize::createStoreCopy(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - MachineOperand src = copy->getOperand(0); - unsigned regnum = src.getReg(); - - SmallVector scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i]); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); -} - -///============================================================================= -///va <= shufflev2 vb, vc, , -///gets converted to 2 moves into a1 and a2. The source of the moves depend on -///i1 and i2. i1, i2 can belong to the set {0, 1, 2, 3} for shufflev2. For -///shufflev4 the set is {0,..7}. For example, if i1=3, i2=0, the move -///instructions will be -///a1 <= c2 -///a2 <= b1 -///============================================================================= -void VectorElementize::createVecShuffle(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - unsigned src1regnum = Instr->getOperand(1).getReg(); - unsigned src2regnum = Instr->getOperand(2).getReg(); - - SmallVector dest = getScalarRegisters(destregnum); - SmallVector src1 = getScalarRegisters(src1regnum); - SmallVector src2 = getScalarRegisters(src2regnum); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; iget(getScalarVersion(Instr)), dest[i]); - MachineOperand which=Instr->getOperand(3+i); - assert(which.isImm() && "Shuffle operand not a constant"); - - int src=which.getImm(); - int elem=src%numcopies; - - if (which.getImm() < numcopies) - copy.addReg(src1[elem]); - else - copy.addReg(src2[elem]); - copies.push_back(copy); - } -} - -///============================================================================= -///a <= extractv2 va, -///gets turned into a simple move to the scalar register a. The source depends -///on i1. 
-///============================================================================= -void VectorElementize::createVecExtract(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - unsigned srcregnum = Instr->getOperand(1).getReg(); - - SmallVector src = getScalarRegisters(srcregnum); - - MachineOperand which = Instr->getOperand(2); - assert(which.isImm() && "Extract operand not a constant"); - - DebugLoc DL = Instr->getDebugLoc(); - copies.push_back(BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), - Instr->getOperand(0).getReg()) - .addReg(src[which.getImm()])); -} - -///============================================================================= -///va <= vecinsertv2 vb, c, -///This instruction copies all elements of vb to va, except the 'i1'th element. -///The scalar value c becomes the 'i1'th element of va. -///This gets translated to 2 (4 for vecinsertv4) moves. -///============================================================================= -void VectorElementize::createVecInsert(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - unsigned srcregnum = Instr->getOperand(1).getReg(); - - SmallVector dest = getScalarRegisters(destregnum); - SmallVector src = getScalarRegisters(srcregnum); - - MachineOperand which=Instr->getOperand(3); - assert(which.isImm() && "Insert operand not a constant"); - unsigned int elem=which.getImm(); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; iget(getScalarVersion(Instr)), dest[i]); - - if (i != elem) - copy.addReg(src[i]); - else - copy.addOperand(Instr->getOperand(2)); - - copies.push_back(copy); - } - -} - -///============================================================================= -///va <= buildv2 b1, b2 -///gets translated to -///a1 <= b1 -///a2 <= b2 -///============================================================================= -void VectorElementize::createVecBuild(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - - SmallVector dest = getScalarRegisters(destregnum); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; iget(getScalarVersion(Instr)), - dest[i]) - .addOperand(Instr->getOperand(1+i))); -} - -///============================================================================= -///For a tex inst of the form -///va <= op [scalar operands] -///the following multi output instruction is created : -///[v1, v2] <= op' [scalar operands] -///============================================================================= -void VectorElementize::createVecDest(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - // Remove the dest, that should be a vector operand. 
- MachineOperand dest = copy->getOperand(0); - unsigned regnum = dest.getReg(); - - SmallVector scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i], RegState::Define); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); -} - -///============================================================================= -///Look at the vector instruction type and dispatch to the createVec<...> -///function that creates the scalar copies. -///============================================================================= -void VectorElementize::createCopies(MachineFunction& F, MachineInstr *Instr, - std::vector& copies) { - if (ISVECLOAD(Instr)) { - createLoadCopy(F, Instr, copies); - return; - } - if (ISVECSTORE(Instr)) { - createStoreCopy(F, Instr, copies); - return; - } - if (ISVECSHUFFLE(Instr)) { - createVecShuffle(F, Instr, copies); - return; - } - if (ISVECEXTRACT(Instr)) { - createVecExtract(F, Instr, copies); - return; - } - if (ISVECINSERT(Instr)) { - createVecInsert(F, Instr, copies); - return; - } - if (ISVECDEST(Instr)) { - createVecDest(F, Instr, copies); - return; - } - if (ISVECBUILD(Instr)) { - createVecBuild(F, Instr, copies); - return; - } - - unsigned numcopies=numCopiesNeeded(Instr); - - for (unsigned i=0; i allOperands; - std::vector isDef; - - for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) { - MachineOperand oper = copy->getOperand(j); - allOperands.push_back(oper); - if (oper.isReg()) - isDef.push_back(oper.isDef()); - else - isDef.push_back(false); - } - - for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) - copy->RemoveOperand(0); - - copy->setDesc(InstrInfo->get(getScalarVersion(Instr))); - - for (unsigned j=0, e=allOperands.size(); j!=e; ++j) { - MachineOperand oper=allOperands[j]; - if (oper.isReg()) { - unsigned regnum = oper.getReg(); - if (isVectorRegister(regnum)) { - - SmallVector scalarRegs = getScalarRegisters(regnum); - copy.addReg(scalarRegs[i], getDefRegState(isDef[j])); - } - else - copy.addOperand(oper); - } - else - copy.addOperand(oper); - } - } -} - -///============================================================================= -///Scan through all basic blocks, looking for vector instructions. -///For each vector instruction I, insert the scalar copies before I, and -///add I into toRemove vector. Finally remove all instructions in toRemove. 
-///============================================================================= -void VectorElementize::elementize(MachineFunction &F) { - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); - BI!=BE; ++BI) { - MachineBasicBlock *BB = &*BI; - - std::vector copies; - std::vector toRemove; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); - II!=IE; ++II) { - MachineInstr *Instr = &*II; - - if (!isVectorInstr(Instr)) - continue; - - copies.clear(); - createCopies(F, Instr, copies); - for (unsigned i=0, e=copies.size(); i!=e; ++i) - BB->insert(II, copies[i]); - - assert((copies.size() > 0) && "Problem in createCopies"); - toRemove.push_back(Instr); - } - for (unsigned i=0, e=toRemove.size(); i!=e; ++i) - F.DeleteMachineInstr(toRemove[i]->getParent()->remove(toRemove[i])); - } -} - -///============================================================================= -///a <= b -///... -///... -///x <= op(a, ...) -///gets converted to -/// -///x <= op(b, ...) -///The original move is still present. This works on SSA form machine code. -///Note that a <= b should be a simple vreg-to-vreg move instruction. -///TBD : I didn't find a function that can do replaceOperand, so I remove -///all operands and add all of them again, replacing the one while adding. -///============================================================================= -unsigned VectorElementize::copyProp(MachineFunction &F) { - unsigned numReplacements = 0; - - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE; - ++BI) { - MachineBasicBlock *BB = &*BI; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE; - ++II) { - MachineInstr *Instr = &*II; - - // Don't do copy propagation on PHI as it will cause unnecessary - // live range overlap. 
- if ((Instr->getOpcode() == TargetOpcode::PHI) || - (Instr->getOpcode() == TargetOpcode::DBG_VALUE)) - continue; - - bool needsReplacement = false; - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - if (!oper.isReg()) continue; - if (oper.isDef()) continue; - if (!RegInfo->isVirtualRegister(oper.getReg())) continue; - - MachineInstr *defInstr = MRI->getVRegDef(oper.getReg()); - - if (!defInstr) continue; - - if (!isSimpleMove(defInstr)) continue; - - MachineOperand defSrc = defInstr->getOperand(1); - if (!defSrc.isReg()) continue; - if (!RegInfo->isVirtualRegister(defSrc.getReg())) continue; - - needsReplacement = true; - - } - if (!needsReplacement) continue; - - numReplacements++; - - std::vector operands; - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - bool flag = false; - do { - if (!(oper.isReg())) - break; - if (oper.isDef()) - break; - if (!(RegInfo->isVirtualRegister(oper.getReg()))) - break; - MachineInstr *defInstr = MRI->getVRegDef(oper.getReg()); - if (!(isSimpleMove(defInstr))) - break; - MachineOperand defSrc = defInstr->getOperand(1); - if (!(defSrc.isReg())) - break; - if (!(RegInfo->isVirtualRegister(defSrc.getReg()))) - break; - operands.push_back(defSrc); - flag = true; - } while (0); - if (flag == false) - operands.push_back(oper); - } - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) - Instr->RemoveOperand(0); - for (unsigned i=0, e=operands.size(); i!=e; ++i) - Instr->addOperand(F, operands[i]); - - } - } - return numReplacements; -} - -///============================================================================= -///Look for simple vreg-to-vreg instructions whose use_empty() is true, add -///them to deadMoves vector. Then remove all instructions in deadMoves. -///============================================================================= -unsigned VectorElementize::removeDeadMoves(MachineFunction &F) { - std::vector deadMoves; - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE; - ++BI) { - MachineBasicBlock *BB = &*BI; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE; - ++II) { - MachineInstr *Instr = &*II; - - if (!isSimpleMove(Instr)) continue; - - MachineOperand dest = Instr->getOperand(0); - assert(dest.isReg() && "dest of move not a register"); - assert(RegInfo->isVirtualRegister(dest.getReg()) && - "dest of move not a virtual register"); - - if (MRI->use_empty(dest.getReg())) { - deadMoves.push_back(Instr); - } - } - } - - for (unsigned i=0, e=deadMoves.size(); i!=e; ++i) - F.DeleteMachineInstr(deadMoves[i]->getParent()->remove(deadMoves[i])); - - return deadMoves.size(); -} - -///============================================================================= -///Main function for this pass. 
-///============================================================================= -bool VectorElementize::runOnMachineFunction(MachineFunction &F) { - MRI = &F.getRegInfo(); - - RegInfo = TM.getRegisterInfo(); - InstrInfo = TM.getInstrInfo(); - - VectorToScalarMap.clear(); - - elementize(F); - - if (RemoveRedundantMoves) - while (1) { - if (copyProp(F) == 0) break; - removeDeadMoves(F); - } - - return true; -} - -FunctionPass *llvm::createVectorElementizePass(NVPTXTargetMachine &tm) { - return new VectorElementize(tm); -} - -unsigned VectorElementize::getScalarVersion(unsigned opcode) { - if (opcode == NVPTX::PHI) - return opcode; - if (opcode == NVPTX::IMPLICIT_DEF) - return opcode; - switch(opcode) { - default: llvm_unreachable("Scalar version not set, fix NVPTXVector.td"); - case TargetOpcode::COPY: return TargetOpcode::COPY; - case NVPTX::AddCCCV2I32: return NVPTX::ADDCCCi32rr; - case NVPTX::AddCCCV4I32: return NVPTX::ADDCCCi32rr; - case NVPTX::AddCCV2I32: return NVPTX::ADDCCi32rr; - case NVPTX::AddCCV4I32: return NVPTX::ADDCCi32rr; - case NVPTX::Build_Vector2_f32: return NVPTX::FMOV32rr; - case NVPTX::Build_Vector2_f64: return NVPTX::FMOV64rr; - case NVPTX::Build_Vector2_i16: return NVPTX::IMOV16rr; - case NVPTX::Build_Vector2_i32: return NVPTX::IMOV32rr; - case NVPTX::Build_Vector2_i64: return NVPTX::IMOV64rr; - case NVPTX::Build_Vector2_i8: return NVPTX::IMOV8rr; - case NVPTX::Build_Vector4_f32: return NVPTX::FMOV32rr; - case NVPTX::Build_Vector4_i16: return NVPTX::IMOV16rr; - case NVPTX::Build_Vector4_i32: return NVPTX::IMOV32rr; - case NVPTX::Build_Vector4_i8: return NVPTX::IMOV8rr; - case NVPTX::CVTv2i16tov2i32: return NVPTX::Zint_extendext16to32; - case NVPTX::CVTv2i64tov2i32: return NVPTX::TRUNC_64to32; - case NVPTX::CVTv2i8tov2i32: return NVPTX::Zint_extendext8to32; - case NVPTX::CVTv4i16tov4i32: return NVPTX::Zint_extendext16to32; - case NVPTX::CVTv4i8tov4i32: return NVPTX::Zint_extendext8to32; - case NVPTX::F32MAD_ftzV2: return NVPTX::FMAD32_ftzrrr; - case NVPTX::F32MADV2: return NVPTX::FMAD32rrr; - case NVPTX::F32MAD_ftzV4: return NVPTX::FMAD32_ftzrrr; - case NVPTX::F32MADV4: return NVPTX::FMAD32rrr; - case NVPTX::F32FMA_ftzV2: return NVPTX::FMA32_ftzrrr; - case NVPTX::F32FMAV2: return NVPTX::FMA32rrr; - case NVPTX::F32FMA_ftzV4: return NVPTX::FMA32_ftzrrr; - case NVPTX::F32FMAV4: return NVPTX::FMA32rrr; - case NVPTX::F64FMAV2: return NVPTX::FMA64rrr; - case NVPTX::FVecEQV2F32: return NVPTX::FSetEQf32rr_toi32; - case NVPTX::FVecEQV2F64: return NVPTX::FSetEQf64rr_toi64; - case NVPTX::FVecEQV4F32: return NVPTX::FSetEQf32rr_toi32; - case NVPTX::FVecGEV2F32: return NVPTX::FSetGEf32rr_toi32; - case NVPTX::FVecGEV2F64: return NVPTX::FSetGEf64rr_toi64; - case NVPTX::FVecGEV4F32: return NVPTX::FSetGEf32rr_toi32; - case NVPTX::FVecGTV2F32: return NVPTX::FSetGTf32rr_toi32; - case NVPTX::FVecGTV2F64: return NVPTX::FSetGTf64rr_toi64; - case NVPTX::FVecGTV4F32: return NVPTX::FSetGTf32rr_toi32; - case NVPTX::FVecLEV2F32: return NVPTX::FSetLEf32rr_toi32; - case NVPTX::FVecLEV2F64: return NVPTX::FSetLEf64rr_toi64; - case NVPTX::FVecLEV4F32: return NVPTX::FSetLEf32rr_toi32; - case NVPTX::FVecLTV2F32: return NVPTX::FSetLTf32rr_toi32; - case NVPTX::FVecLTV2F64: return NVPTX::FSetLTf64rr_toi64; - case NVPTX::FVecLTV4F32: return NVPTX::FSetLTf32rr_toi32; - case NVPTX::FVecNANV2F32: return NVPTX::FSetNANf32rr_toi32; - case NVPTX::FVecNANV2F64: return NVPTX::FSetNANf64rr_toi64; - case NVPTX::FVecNANV4F32: return NVPTX::FSetNANf32rr_toi32; - case NVPTX::FVecNEV2F32: return 
NVPTX::FSetNEf32rr_toi32; - case NVPTX::FVecNEV2F64: return NVPTX::FSetNEf64rr_toi64; - case NVPTX::FVecNEV4F32: return NVPTX::FSetNEf32rr_toi32; - case NVPTX::FVecNUMV2F32: return NVPTX::FSetNUMf32rr_toi32; - case NVPTX::FVecNUMV2F64: return NVPTX::FSetNUMf64rr_toi64; - case NVPTX::FVecNUMV4F32: return NVPTX::FSetNUMf32rr_toi32; - case NVPTX::FVecUEQV2F32: return NVPTX::FSetUEQf32rr_toi32; - case NVPTX::FVecUEQV2F64: return NVPTX::FSetUEQf64rr_toi64; - case NVPTX::FVecUEQV4F32: return NVPTX::FSetUEQf32rr_toi32; - case NVPTX::FVecUGEV2F32: return NVPTX::FSetUGEf32rr_toi32; - case NVPTX::FVecUGEV2F64: return NVPTX::FSetUGEf64rr_toi64; - case NVPTX::FVecUGEV4F32: return NVPTX::FSetUGEf32rr_toi32; - case NVPTX::FVecUGTV2F32: return NVPTX::FSetUGTf32rr_toi32; - case NVPTX::FVecUGTV2F64: return NVPTX::FSetUGTf64rr_toi64; - case NVPTX::FVecUGTV4F32: return NVPTX::FSetUGTf32rr_toi32; - case NVPTX::FVecULEV2F32: return NVPTX::FSetULEf32rr_toi32; - case NVPTX::FVecULEV2F64: return NVPTX::FSetULEf64rr_toi64; - case NVPTX::FVecULEV4F32: return NVPTX::FSetULEf32rr_toi32; - case NVPTX::FVecULTV2F32: return NVPTX::FSetULTf32rr_toi32; - case NVPTX::FVecULTV2F64: return NVPTX::FSetULTf64rr_toi64; - case NVPTX::FVecULTV4F32: return NVPTX::FSetULTf32rr_toi32; - case NVPTX::FVecUNEV2F32: return NVPTX::FSetUNEf32rr_toi32; - case NVPTX::FVecUNEV2F64: return NVPTX::FSetUNEf64rr_toi64; - case NVPTX::FVecUNEV4F32: return NVPTX::FSetUNEf32rr_toi32; - case NVPTX::I16MADV2: return NVPTX::MAD16rrr; - case NVPTX::I16MADV4: return NVPTX::MAD16rrr; - case NVPTX::I32MADV2: return NVPTX::MAD32rrr; - case NVPTX::I32MADV4: return NVPTX::MAD32rrr; - case NVPTX::I64MADV2: return NVPTX::MAD64rrr; - case NVPTX::I8MADV2: return NVPTX::MAD8rrr; - case NVPTX::I8MADV4: return NVPTX::MAD8rrr; - case NVPTX::ShiftLV2I16: return NVPTX::SHLi16rr; - case NVPTX::ShiftLV2I32: return NVPTX::SHLi32rr; - case NVPTX::ShiftLV2I64: return NVPTX::SHLi64rr; - case NVPTX::ShiftLV2I8: return NVPTX::SHLi8rr; - case NVPTX::ShiftLV4I16: return NVPTX::SHLi16rr; - case NVPTX::ShiftLV4I32: return NVPTX::SHLi32rr; - case NVPTX::ShiftLV4I8: return NVPTX::SHLi8rr; - case NVPTX::ShiftRAV2I16: return NVPTX::SRAi16rr; - case NVPTX::ShiftRAV2I32: return NVPTX::SRAi32rr; - case NVPTX::ShiftRAV2I64: return NVPTX::SRAi64rr; - case NVPTX::ShiftRAV2I8: return NVPTX::SRAi8rr; - case NVPTX::ShiftRAV4I16: return NVPTX::SRAi16rr; - case NVPTX::ShiftRAV4I32: return NVPTX::SRAi32rr; - case NVPTX::ShiftRAV4I8: return NVPTX::SRAi8rr; - case NVPTX::ShiftRLV2I16: return NVPTX::SRLi16rr; - case NVPTX::ShiftRLV2I32: return NVPTX::SRLi32rr; - case NVPTX::ShiftRLV2I64: return NVPTX::SRLi64rr; - case NVPTX::ShiftRLV2I8: return NVPTX::SRLi8rr; - case NVPTX::ShiftRLV4I16: return NVPTX::SRLi16rr; - case NVPTX::ShiftRLV4I32: return NVPTX::SRLi32rr; - case NVPTX::ShiftRLV4I8: return NVPTX::SRLi8rr; - case NVPTX::SubCCCV2I32: return NVPTX::SUBCCCi32rr; - case NVPTX::SubCCCV4I32: return NVPTX::SUBCCCi32rr; - case NVPTX::SubCCV2I32: return NVPTX::SUBCCi32rr; - case NVPTX::SubCCV4I32: return NVPTX::SUBCCi32rr; - case NVPTX::V2F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; - case NVPTX::V2F32Div_prec: return NVPTX::FDIV32rr_prec; - case NVPTX::V2F32Div_ftz: return NVPTX::FDIV32rr_ftz; - case NVPTX::V2F32Div: return NVPTX::FDIV32rr; - case NVPTX::V2F32_Select: return NVPTX::SELECTf32rr; - case NVPTX::V2F64Div: return NVPTX::FDIV64rr; - case NVPTX::V2F64_Select: return NVPTX::SELECTf64rr; - case NVPTX::V2I16_Select: return NVPTX::SELECTi16rr; - case NVPTX::V2I32_Select: return 
NVPTX::SELECTi32rr; - case NVPTX::V2I64_Select: return NVPTX::SELECTi64rr; - case NVPTX::V2I8_Select: return NVPTX::SELECTi8rr; - case NVPTX::V2f32Extract: return NVPTX::FMOV32rr; - case NVPTX::V2f32Insert: return NVPTX::FMOV32rr; - case NVPTX::V2f32Mov: return NVPTX::FMOV32rr; - case NVPTX::V2f64Extract: return NVPTX::FMOV64rr; - case NVPTX::V2f64Insert: return NVPTX::FMOV64rr; - case NVPTX::V2f64Mov: return NVPTX::FMOV64rr; - case NVPTX::V2i16Extract: return NVPTX::IMOV16rr; - case NVPTX::V2i16Insert: return NVPTX::IMOV16rr; - case NVPTX::V2i16Mov: return NVPTX::IMOV16rr; - case NVPTX::V2i32Extract: return NVPTX::IMOV32rr; - case NVPTX::V2i32Insert: return NVPTX::IMOV32rr; - case NVPTX::V2i32Mov: return NVPTX::IMOV32rr; - case NVPTX::V2i64Extract: return NVPTX::IMOV64rr; - case NVPTX::V2i64Insert: return NVPTX::IMOV64rr; - case NVPTX::V2i64Mov: return NVPTX::IMOV64rr; - case NVPTX::V2i8Extract: return NVPTX::IMOV8rr; - case NVPTX::V2i8Insert: return NVPTX::IMOV8rr; - case NVPTX::V2i8Mov: return NVPTX::IMOV8rr; - case NVPTX::V4F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz; - case NVPTX::V4F32Div_prec: return NVPTX::FDIV32rr_prec; - case NVPTX::V4F32Div_ftz: return NVPTX::FDIV32rr_ftz; - case NVPTX::V4F32Div: return NVPTX::FDIV32rr; - case NVPTX::V4F32_Select: return NVPTX::SELECTf32rr; - case NVPTX::V4I16_Select: return NVPTX::SELECTi16rr; - case NVPTX::V4I32_Select: return NVPTX::SELECTi32rr; - case NVPTX::V4I8_Select: return NVPTX::SELECTi8rr; - case NVPTX::V4f32Extract: return NVPTX::FMOV32rr; - case NVPTX::V4f32Insert: return NVPTX::FMOV32rr; - case NVPTX::V4f32Mov: return NVPTX::FMOV32rr; - case NVPTX::V4i16Extract: return NVPTX::IMOV16rr; - case NVPTX::V4i16Insert: return NVPTX::IMOV16rr; - case NVPTX::V4i16Mov: return NVPTX::IMOV16rr; - case NVPTX::V4i32Extract: return NVPTX::IMOV32rr; - case NVPTX::V4i32Insert: return NVPTX::IMOV32rr; - case NVPTX::V4i32Mov: return NVPTX::IMOV32rr; - case NVPTX::V4i8Extract: return NVPTX::IMOV8rr; - case NVPTX::V4i8Insert: return NVPTX::IMOV8rr; - case NVPTX::V4i8Mov: return NVPTX::IMOV8rr; - case NVPTX::VAddV2I16: return NVPTX::ADDi16rr; - case NVPTX::VAddV2I32: return NVPTX::ADDi32rr; - case NVPTX::VAddV2I64: return NVPTX::ADDi64rr; - case NVPTX::VAddV2I8: return NVPTX::ADDi8rr; - case NVPTX::VAddV4I16: return NVPTX::ADDi16rr; - case NVPTX::VAddV4I32: return NVPTX::ADDi32rr; - case NVPTX::VAddV4I8: return NVPTX::ADDi8rr; - case NVPTX::VAddfV2F32: return NVPTX::FADDf32rr; - case NVPTX::VAddfV2F32_ftz: return NVPTX::FADDf32rr_ftz; - case NVPTX::VAddfV2F64: return NVPTX::FADDf64rr; - case NVPTX::VAddfV4F32: return NVPTX::FADDf32rr; - case NVPTX::VAddfV4F32_ftz: return NVPTX::FADDf32rr_ftz; - case NVPTX::VAndV2I16: return NVPTX::ANDb16rr; - case NVPTX::VAndV2I32: return NVPTX::ANDb32rr; - case NVPTX::VAndV2I64: return NVPTX::ANDb64rr; - case NVPTX::VAndV2I8: return NVPTX::ANDb8rr; - case NVPTX::VAndV4I16: return NVPTX::ANDb16rr; - case NVPTX::VAndV4I32: return NVPTX::ANDb32rr; - case NVPTX::VAndV4I8: return NVPTX::ANDb8rr; - case NVPTX::VMulfV2F32_ftz: return NVPTX::FMULf32rr_ftz; - case NVPTX::VMulfV2F32: return NVPTX::FMULf32rr; - case NVPTX::VMulfV2F64: return NVPTX::FMULf64rr; - case NVPTX::VMulfV4F32_ftz: return NVPTX::FMULf32rr_ftz; - case NVPTX::VMulfV4F32: return NVPTX::FMULf32rr; - case NVPTX::VMultHSV2I16: return NVPTX::MULTHSi16rr; - case NVPTX::VMultHSV2I32: return NVPTX::MULTHSi32rr; - case NVPTX::VMultHSV2I64: return NVPTX::MULTHSi64rr; - case NVPTX::VMultHSV2I8: return NVPTX::MULTHSi8rr; - case NVPTX::VMultHSV4I16: return 
NVPTX::MULTHSi16rr; - case NVPTX::VMultHSV4I32: return NVPTX::MULTHSi32rr; - case NVPTX::VMultHSV4I8: return NVPTX::MULTHSi8rr; - case NVPTX::VMultHUV2I16: return NVPTX::MULTHUi16rr; - case NVPTX::VMultHUV2I32: return NVPTX::MULTHUi32rr; - case NVPTX::VMultHUV2I64: return NVPTX::MULTHUi64rr; - case NVPTX::VMultHUV2I8: return NVPTX::MULTHUi8rr; - case NVPTX::VMultHUV4I16: return NVPTX::MULTHUi16rr; - case NVPTX::VMultHUV4I32: return NVPTX::MULTHUi32rr; - case NVPTX::VMultHUV4I8: return NVPTX::MULTHUi8rr; - case NVPTX::VMultV2I16: return NVPTX::MULTi16rr; - case NVPTX::VMultV2I32: return NVPTX::MULTi32rr; - case NVPTX::VMultV2I64: return NVPTX::MULTi64rr; - case NVPTX::VMultV2I8: return NVPTX::MULTi8rr; - case NVPTX::VMultV4I16: return NVPTX::MULTi16rr; - case NVPTX::VMultV4I32: return NVPTX::MULTi32rr; - case NVPTX::VMultV4I8: return NVPTX::MULTi8rr; - case NVPTX::VNegV2I16: return NVPTX::INEG16; - case NVPTX::VNegV2I32: return NVPTX::INEG32; - case NVPTX::VNegV2I64: return NVPTX::INEG64; - case NVPTX::VNegV2I8: return NVPTX::INEG8; - case NVPTX::VNegV4I16: return NVPTX::INEG16; - case NVPTX::VNegV4I32: return NVPTX::INEG32; - case NVPTX::VNegV4I8: return NVPTX::INEG8; - case NVPTX::VNegv2f32: return NVPTX::FNEGf32; - case NVPTX::VNegv2f32_ftz: return NVPTX::FNEGf32_ftz; - case NVPTX::VNegv2f64: return NVPTX::FNEGf64; - case NVPTX::VNegv4f32: return NVPTX::FNEGf32; - case NVPTX::VNegv4f32_ftz: return NVPTX::FNEGf32_ftz; - case NVPTX::VNotV2I16: return NVPTX::NOT16; - case NVPTX::VNotV2I32: return NVPTX::NOT32; - case NVPTX::VNotV2I64: return NVPTX::NOT64; - case NVPTX::VNotV2I8: return NVPTX::NOT8; - case NVPTX::VNotV4I16: return NVPTX::NOT16; - case NVPTX::VNotV4I32: return NVPTX::NOT32; - case NVPTX::VNotV4I8: return NVPTX::NOT8; - case NVPTX::VOrV2I16: return NVPTX::ORb16rr; - case NVPTX::VOrV2I32: return NVPTX::ORb32rr; - case NVPTX::VOrV2I64: return NVPTX::ORb64rr; - case NVPTX::VOrV2I8: return NVPTX::ORb8rr; - case NVPTX::VOrV4I16: return NVPTX::ORb16rr; - case NVPTX::VOrV4I32: return NVPTX::ORb32rr; - case NVPTX::VOrV4I8: return NVPTX::ORb8rr; - case NVPTX::VSDivV2I16: return NVPTX::SDIVi16rr; - case NVPTX::VSDivV2I32: return NVPTX::SDIVi32rr; - case NVPTX::VSDivV2I64: return NVPTX::SDIVi64rr; - case NVPTX::VSDivV2I8: return NVPTX::SDIVi8rr; - case NVPTX::VSDivV4I16: return NVPTX::SDIVi16rr; - case NVPTX::VSDivV4I32: return NVPTX::SDIVi32rr; - case NVPTX::VSDivV4I8: return NVPTX::SDIVi8rr; - case NVPTX::VSRemV2I16: return NVPTX::SREMi16rr; - case NVPTX::VSRemV2I32: return NVPTX::SREMi32rr; - case NVPTX::VSRemV2I64: return NVPTX::SREMi64rr; - case NVPTX::VSRemV2I8: return NVPTX::SREMi8rr; - case NVPTX::VSRemV4I16: return NVPTX::SREMi16rr; - case NVPTX::VSRemV4I32: return NVPTX::SREMi32rr; - case NVPTX::VSRemV4I8: return NVPTX::SREMi8rr; - case NVPTX::VSubV2I16: return NVPTX::SUBi16rr; - case NVPTX::VSubV2I32: return NVPTX::SUBi32rr; - case NVPTX::VSubV2I64: return NVPTX::SUBi64rr; - case NVPTX::VSubV2I8: return NVPTX::SUBi8rr; - case NVPTX::VSubV4I16: return NVPTX::SUBi16rr; - case NVPTX::VSubV4I32: return NVPTX::SUBi32rr; - case NVPTX::VSubV4I8: return NVPTX::SUBi8rr; - case NVPTX::VSubfV2F32_ftz: return NVPTX::FSUBf32rr_ftz; - case NVPTX::VSubfV2F32: return NVPTX::FSUBf32rr; - case NVPTX::VSubfV2F64: return NVPTX::FSUBf64rr; - case NVPTX::VSubfV4F32_ftz: return NVPTX::FSUBf32rr_ftz; - case NVPTX::VSubfV4F32: return NVPTX::FSUBf32rr; - case NVPTX::VUDivV2I16: return NVPTX::UDIVi16rr; - case NVPTX::VUDivV2I32: return NVPTX::UDIVi32rr; - case NVPTX::VUDivV2I64: return NVPTX::UDIVi64rr; 
- case NVPTX::VUDivV2I8: return NVPTX::UDIVi8rr; - case NVPTX::VUDivV4I16: return NVPTX::UDIVi16rr; - case NVPTX::VUDivV4I32: return NVPTX::UDIVi32rr; - case NVPTX::VUDivV4I8: return NVPTX::UDIVi8rr; - case NVPTX::VURemV2I16: return NVPTX::UREMi16rr; - case NVPTX::VURemV2I32: return NVPTX::UREMi32rr; - case NVPTX::VURemV2I64: return NVPTX::UREMi64rr; - case NVPTX::VURemV2I8: return NVPTX::UREMi8rr; - case NVPTX::VURemV4I16: return NVPTX::UREMi16rr; - case NVPTX::VURemV4I32: return NVPTX::UREMi32rr; - case NVPTX::VURemV4I8: return NVPTX::UREMi8rr; - case NVPTX::VXorV2I16: return NVPTX::XORb16rr; - case NVPTX::VXorV2I32: return NVPTX::XORb32rr; - case NVPTX::VXorV2I64: return NVPTX::XORb64rr; - case NVPTX::VXorV2I8: return NVPTX::XORb8rr; - case NVPTX::VXorV4I16: return NVPTX::XORb16rr; - case NVPTX::VXorV4I32: return NVPTX::XORb32rr; - case NVPTX::VXorV4I8: return NVPTX::XORb8rr; - case NVPTX::VecSEQV2I16: return NVPTX::ISetSEQi16rr_toi16; - case NVPTX::VecSEQV2I32: return NVPTX::ISetSEQi32rr_toi32; - case NVPTX::VecSEQV2I64: return NVPTX::ISetSEQi64rr_toi64; - case NVPTX::VecSEQV2I8: return NVPTX::ISetSEQi8rr_toi8; - case NVPTX::VecSEQV4I16: return NVPTX::ISetSEQi16rr_toi16; - case NVPTX::VecSEQV4I32: return NVPTX::ISetSEQi32rr_toi32; - case NVPTX::VecSEQV4I8: return NVPTX::ISetSEQi8rr_toi8; - case NVPTX::VecSGEV2I16: return NVPTX::ISetSGEi16rr_toi16; - case NVPTX::VecSGEV2I32: return NVPTX::ISetSGEi32rr_toi32; - case NVPTX::VecSGEV2I64: return NVPTX::ISetSGEi64rr_toi64; - case NVPTX::VecSGEV2I8: return NVPTX::ISetSGEi8rr_toi8; - case NVPTX::VecSGEV4I16: return NVPTX::ISetSGEi16rr_toi16; - case NVPTX::VecSGEV4I32: return NVPTX::ISetSGEi32rr_toi32; - case NVPTX::VecSGEV4I8: return NVPTX::ISetSGEi8rr_toi8; - case NVPTX::VecSGTV2I16: return NVPTX::ISetSGTi16rr_toi16; - case NVPTX::VecSGTV2I32: return NVPTX::ISetSGTi32rr_toi32; - case NVPTX::VecSGTV2I64: return NVPTX::ISetSGTi64rr_toi64; - case NVPTX::VecSGTV2I8: return NVPTX::ISetSGTi8rr_toi8; - case NVPTX::VecSGTV4I16: return NVPTX::ISetSGTi16rr_toi16; - case NVPTX::VecSGTV4I32: return NVPTX::ISetSGTi32rr_toi32; - case NVPTX::VecSGTV4I8: return NVPTX::ISetSGTi8rr_toi8; - case NVPTX::VecSLEV2I16: return NVPTX::ISetSLEi16rr_toi16; - case NVPTX::VecSLEV2I32: return NVPTX::ISetSLEi32rr_toi32; - case NVPTX::VecSLEV2I64: return NVPTX::ISetSLEi64rr_toi64; - case NVPTX::VecSLEV2I8: return NVPTX::ISetSLEi8rr_toi8; - case NVPTX::VecSLEV4I16: return NVPTX::ISetSLEi16rr_toi16; - case NVPTX::VecSLEV4I32: return NVPTX::ISetSLEi32rr_toi32; - case NVPTX::VecSLEV4I8: return NVPTX::ISetSLEi8rr_toi8; - case NVPTX::VecSLTV2I16: return NVPTX::ISetSLTi16rr_toi16; - case NVPTX::VecSLTV2I32: return NVPTX::ISetSLTi32rr_toi32; - case NVPTX::VecSLTV2I64: return NVPTX::ISetSLTi64rr_toi64; - case NVPTX::VecSLTV2I8: return NVPTX::ISetSLTi8rr_toi8; - case NVPTX::VecSLTV4I16: return NVPTX::ISetSLTi16rr_toi16; - case NVPTX::VecSLTV4I32: return NVPTX::ISetSLTi32rr_toi32; - case NVPTX::VecSLTV4I8: return NVPTX::ISetSLTi8rr_toi8; - case NVPTX::VecSNEV2I16: return NVPTX::ISetSNEi16rr_toi16; - case NVPTX::VecSNEV2I32: return NVPTX::ISetSNEi32rr_toi32; - case NVPTX::VecSNEV2I64: return NVPTX::ISetSNEi64rr_toi64; - case NVPTX::VecSNEV2I8: return NVPTX::ISetSNEi8rr_toi8; - case NVPTX::VecSNEV4I16: return NVPTX::ISetSNEi16rr_toi16; - case NVPTX::VecSNEV4I32: return NVPTX::ISetSNEi32rr_toi32; - case NVPTX::VecSNEV4I8: return NVPTX::ISetSNEi8rr_toi8; - case NVPTX::VecShuffle_v2f32: return NVPTX::FMOV32rr; - case NVPTX::VecShuffle_v2f64: return NVPTX::FMOV64rr; - case 
NVPTX::VecShuffle_v2i16: return NVPTX::IMOV16rr; - case NVPTX::VecShuffle_v2i32: return NVPTX::IMOV32rr; - case NVPTX::VecShuffle_v2i64: return NVPTX::IMOV64rr; - case NVPTX::VecShuffle_v2i8: return NVPTX::IMOV8rr; - case NVPTX::VecShuffle_v4f32: return NVPTX::FMOV32rr; - case NVPTX::VecShuffle_v4i16: return NVPTX::IMOV16rr; - case NVPTX::VecShuffle_v4i32: return NVPTX::IMOV32rr; - case NVPTX::VecShuffle_v4i8: return NVPTX::IMOV8rr; - case NVPTX::VecUEQV2I16: return NVPTX::ISetUEQi16rr_toi16; - case NVPTX::VecUEQV2I32: return NVPTX::ISetUEQi32rr_toi32; - case NVPTX::VecUEQV2I64: return NVPTX::ISetUEQi64rr_toi64; - case NVPTX::VecUEQV2I8: return NVPTX::ISetUEQi8rr_toi8; - case NVPTX::VecUEQV4I16: return NVPTX::ISetUEQi16rr_toi16; - case NVPTX::VecUEQV4I32: return NVPTX::ISetUEQi32rr_toi32; - case NVPTX::VecUEQV4I8: return NVPTX::ISetUEQi8rr_toi8; - case NVPTX::VecUGEV2I16: return NVPTX::ISetUGEi16rr_toi16; - case NVPTX::VecUGEV2I32: return NVPTX::ISetUGEi32rr_toi32; - case NVPTX::VecUGEV2I64: return NVPTX::ISetUGEi64rr_toi64; - case NVPTX::VecUGEV2I8: return NVPTX::ISetUGEi8rr_toi8; - case NVPTX::VecUGEV4I16: return NVPTX::ISetUGEi16rr_toi16; - case NVPTX::VecUGEV4I32: return NVPTX::ISetUGEi32rr_toi32; - case NVPTX::VecUGEV4I8: return NVPTX::ISetUGEi8rr_toi8; - case NVPTX::VecUGTV2I16: return NVPTX::ISetUGTi16rr_toi16; - case NVPTX::VecUGTV2I32: return NVPTX::ISetUGTi32rr_toi32; - case NVPTX::VecUGTV2I64: return NVPTX::ISetUGTi64rr_toi64; - case NVPTX::VecUGTV2I8: return NVPTX::ISetUGTi8rr_toi8; - case NVPTX::VecUGTV4I16: return NVPTX::ISetUGTi16rr_toi16; - case NVPTX::VecUGTV4I32: return NVPTX::ISetUGTi32rr_toi32; - case NVPTX::VecUGTV4I8: return NVPTX::ISetUGTi8rr_toi8; - case NVPTX::VecULEV2I16: return NVPTX::ISetULEi16rr_toi16; - case NVPTX::VecULEV2I32: return NVPTX::ISetULEi32rr_toi32; - case NVPTX::VecULEV2I64: return NVPTX::ISetULEi64rr_toi64; - case NVPTX::VecULEV2I8: return NVPTX::ISetULEi8rr_toi8; - case NVPTX::VecULEV4I16: return NVPTX::ISetULEi16rr_toi16; - case NVPTX::VecULEV4I32: return NVPTX::ISetULEi32rr_toi32; - case NVPTX::VecULEV4I8: return NVPTX::ISetULEi8rr_toi8; - case NVPTX::VecULTV2I16: return NVPTX::ISetULTi16rr_toi16; - case NVPTX::VecULTV2I32: return NVPTX::ISetULTi32rr_toi32; - case NVPTX::VecULTV2I64: return NVPTX::ISetULTi64rr_toi64; - case NVPTX::VecULTV2I8: return NVPTX::ISetULTi8rr_toi8; - case NVPTX::VecULTV4I16: return NVPTX::ISetULTi16rr_toi16; - case NVPTX::VecULTV4I32: return NVPTX::ISetULTi32rr_toi32; - case NVPTX::VecULTV4I8: return NVPTX::ISetULTi8rr_toi8; - case NVPTX::VecUNEV2I16: return NVPTX::ISetUNEi16rr_toi16; - case NVPTX::VecUNEV2I32: return NVPTX::ISetUNEi32rr_toi32; - case NVPTX::VecUNEV2I64: return NVPTX::ISetUNEi64rr_toi64; - case NVPTX::VecUNEV2I8: return NVPTX::ISetUNEi8rr_toi8; - case NVPTX::VecUNEV4I16: return NVPTX::ISetUNEi16rr_toi16; - case NVPTX::VecUNEV4I32: return NVPTX::ISetUNEi32rr_toi32; - case NVPTX::VecUNEV4I8: return NVPTX::ISetUNEi8rr_toi8; - case NVPTX::INT_PTX_LDU_G_v2i8_32: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_32; - case NVPTX::INT_PTX_LDU_G_v4i8_32: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2i16_32: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_32; - case NVPTX::INT_PTX_LDU_G_v4i16_32: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2i32_32: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_32; - case NVPTX::INT_PTX_LDU_G_v4i32_32: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2f32_32: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_32; - case 
NVPTX::INT_PTX_LDU_G_v4f32_32: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2i64_32: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2f64_32: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_32; - case NVPTX::INT_PTX_LDU_G_v2i8_64: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_64; - case NVPTX::INT_PTX_LDU_G_v4i8_64: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_64; - case NVPTX::INT_PTX_LDU_G_v2i16_64: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_64; - case NVPTX::INT_PTX_LDU_G_v4i16_64: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_64; - case NVPTX::INT_PTX_LDU_G_v2i32_64: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_64; - case NVPTX::INT_PTX_LDU_G_v4i32_64: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_64; - case NVPTX::INT_PTX_LDU_G_v2f32_64: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_64; - case NVPTX::INT_PTX_LDU_G_v4f32_64: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_64; - case NVPTX::INT_PTX_LDU_G_v2i64_64: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_64; - case NVPTX::INT_PTX_LDU_G_v2f64_64: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_64; - - case NVPTX::LoadParamV4I32: return NVPTX::LoadParamScalar4I32; - case NVPTX::LoadParamV4I16: return NVPTX::LoadParamScalar4I16; - case NVPTX::LoadParamV4I8: return NVPTX::LoadParamScalar4I8; - case NVPTX::LoadParamV2I64: return NVPTX::LoadParamScalar2I64; - case NVPTX::LoadParamV2I32: return NVPTX::LoadParamScalar2I32; - case NVPTX::LoadParamV2I16: return NVPTX::LoadParamScalar2I16; - case NVPTX::LoadParamV2I8: return NVPTX::LoadParamScalar2I8; - case NVPTX::LoadParamV4F32: return NVPTX::LoadParamScalar4F32; - case NVPTX::LoadParamV2F32: return NVPTX::LoadParamScalar2F32; - case NVPTX::LoadParamV2F64: return NVPTX::LoadParamScalar2F64; - case NVPTX::StoreParamV4I32: return NVPTX::StoreParamScalar4I32; - case NVPTX::StoreParamV4I16: return NVPTX::StoreParamScalar4I16; - case NVPTX::StoreParamV4I8: return NVPTX::StoreParamScalar4I8; - case NVPTX::StoreParamV2I64: return NVPTX::StoreParamScalar2I64; - case NVPTX::StoreParamV2I32: return NVPTX::StoreParamScalar2I32; - case NVPTX::StoreParamV2I16: return NVPTX::StoreParamScalar2I16; - case NVPTX::StoreParamV2I8: return NVPTX::StoreParamScalar2I8; - case NVPTX::StoreParamV4F32: return NVPTX::StoreParamScalar4F32; - case NVPTX::StoreParamV2F32: return NVPTX::StoreParamScalar2F32; - case NVPTX::StoreParamV2F64: return NVPTX::StoreParamScalar2F64; - case NVPTX::StoreRetvalV4I32: return NVPTX::StoreRetvalScalar4I32; - case NVPTX::StoreRetvalV4I16: return NVPTX::StoreRetvalScalar4I16; - case NVPTX::StoreRetvalV4I8: return NVPTX::StoreRetvalScalar4I8; - case NVPTX::StoreRetvalV2I64: return NVPTX::StoreRetvalScalar2I64; - case NVPTX::StoreRetvalV2I32: return NVPTX::StoreRetvalScalar2I32; - case NVPTX::StoreRetvalV2I16: return NVPTX::StoreRetvalScalar2I16; - case NVPTX::StoreRetvalV2I8: return NVPTX::StoreRetvalScalar2I8; - case NVPTX::StoreRetvalV4F32: return NVPTX::StoreRetvalScalar4F32; - case NVPTX::StoreRetvalV2F32: return NVPTX::StoreRetvalScalar2F32; - case NVPTX::StoreRetvalV2F64: return NVPTX::StoreRetvalScalar2F64; - case NVPTX::VecI32toV4I8: return NVPTX::I32toV4I8; - case NVPTX::VecI64toV4I16: return NVPTX::I64toV4I16; - case NVPTX::VecI16toV2I8: return NVPTX::I16toV2I8; - case NVPTX::VecI32toV2I16: return NVPTX::I32toV2I16; - case NVPTX::VecI64toV2I32: return NVPTX::I64toV2I32; - case NVPTX::VecF64toV2F32: return NVPTX::F64toV2F32; - - case NVPTX::LD_v2i8_avar: return NVPTX::LDV_i8_v2_avar; - case NVPTX::LD_v2i8_areg: return NVPTX::LDV_i8_v2_areg; - case NVPTX::LD_v2i8_ari: return NVPTX::LDV_i8_v2_ari; - case 
NVPTX::LD_v2i8_asi: return NVPTX::LDV_i8_v2_asi; - case NVPTX::LD_v4i8_avar: return NVPTX::LDV_i8_v4_avar; - case NVPTX::LD_v4i8_areg: return NVPTX::LDV_i8_v4_areg; - case NVPTX::LD_v4i8_ari: return NVPTX::LDV_i8_v4_ari; - case NVPTX::LD_v4i8_asi: return NVPTX::LDV_i8_v4_asi; - - case NVPTX::LD_v2i16_avar: return NVPTX::LDV_i16_v2_avar; - case NVPTX::LD_v2i16_areg: return NVPTX::LDV_i16_v2_areg; - case NVPTX::LD_v2i16_ari: return NVPTX::LDV_i16_v2_ari; - case NVPTX::LD_v2i16_asi: return NVPTX::LDV_i16_v2_asi; - case NVPTX::LD_v4i16_avar: return NVPTX::LDV_i16_v4_avar; - case NVPTX::LD_v4i16_areg: return NVPTX::LDV_i16_v4_areg; - case NVPTX::LD_v4i16_ari: return NVPTX::LDV_i16_v4_ari; - case NVPTX::LD_v4i16_asi: return NVPTX::LDV_i16_v4_asi; - - case NVPTX::LD_v2i32_avar: return NVPTX::LDV_i32_v2_avar; - case NVPTX::LD_v2i32_areg: return NVPTX::LDV_i32_v2_areg; - case NVPTX::LD_v2i32_ari: return NVPTX::LDV_i32_v2_ari; - case NVPTX::LD_v2i32_asi: return NVPTX::LDV_i32_v2_asi; - case NVPTX::LD_v4i32_avar: return NVPTX::LDV_i32_v4_avar; - case NVPTX::LD_v4i32_areg: return NVPTX::LDV_i32_v4_areg; - case NVPTX::LD_v4i32_ari: return NVPTX::LDV_i32_v4_ari; - case NVPTX::LD_v4i32_asi: return NVPTX::LDV_i32_v4_asi; - - case NVPTX::LD_v2f32_avar: return NVPTX::LDV_f32_v2_avar; - case NVPTX::LD_v2f32_areg: return NVPTX::LDV_f32_v2_areg; - case NVPTX::LD_v2f32_ari: return NVPTX::LDV_f32_v2_ari; - case NVPTX::LD_v2f32_asi: return NVPTX::LDV_f32_v2_asi; - case NVPTX::LD_v4f32_avar: return NVPTX::LDV_f32_v4_avar; - case NVPTX::LD_v4f32_areg: return NVPTX::LDV_f32_v4_areg; - case NVPTX::LD_v4f32_ari: return NVPTX::LDV_f32_v4_ari; - case NVPTX::LD_v4f32_asi: return NVPTX::LDV_f32_v4_asi; - - case NVPTX::LD_v2i64_avar: return NVPTX::LDV_i64_v2_avar; - case NVPTX::LD_v2i64_areg: return NVPTX::LDV_i64_v2_areg; - case NVPTX::LD_v2i64_ari: return NVPTX::LDV_i64_v2_ari; - case NVPTX::LD_v2i64_asi: return NVPTX::LDV_i64_v2_asi; - case NVPTX::LD_v2f64_avar: return NVPTX::LDV_f64_v2_avar; - case NVPTX::LD_v2f64_areg: return NVPTX::LDV_f64_v2_areg; - case NVPTX::LD_v2f64_ari: return NVPTX::LDV_f64_v2_ari; - case NVPTX::LD_v2f64_asi: return NVPTX::LDV_f64_v2_asi; - - case NVPTX::ST_v2i8_avar: return NVPTX::STV_i8_v2_avar; - case NVPTX::ST_v2i8_areg: return NVPTX::STV_i8_v2_areg; - case NVPTX::ST_v2i8_ari: return NVPTX::STV_i8_v2_ari; - case NVPTX::ST_v2i8_asi: return NVPTX::STV_i8_v2_asi; - case NVPTX::ST_v4i8_avar: return NVPTX::STV_i8_v4_avar; - case NVPTX::ST_v4i8_areg: return NVPTX::STV_i8_v4_areg; - case NVPTX::ST_v4i8_ari: return NVPTX::STV_i8_v4_ari; - case NVPTX::ST_v4i8_asi: return NVPTX::STV_i8_v4_asi; - - case NVPTX::ST_v2i16_avar: return NVPTX::STV_i16_v2_avar; - case NVPTX::ST_v2i16_areg: return NVPTX::STV_i16_v2_areg; - case NVPTX::ST_v2i16_ari: return NVPTX::STV_i16_v2_ari; - case NVPTX::ST_v2i16_asi: return NVPTX::STV_i16_v2_asi; - case NVPTX::ST_v4i16_avar: return NVPTX::STV_i16_v4_avar; - case NVPTX::ST_v4i16_areg: return NVPTX::STV_i16_v4_areg; - case NVPTX::ST_v4i16_ari: return NVPTX::STV_i16_v4_ari; - case NVPTX::ST_v4i16_asi: return NVPTX::STV_i16_v4_asi; - - case NVPTX::ST_v2i32_avar: return NVPTX::STV_i32_v2_avar; - case NVPTX::ST_v2i32_areg: return NVPTX::STV_i32_v2_areg; - case NVPTX::ST_v2i32_ari: return NVPTX::STV_i32_v2_ari; - case NVPTX::ST_v2i32_asi: return NVPTX::STV_i32_v2_asi; - case NVPTX::ST_v4i32_avar: return NVPTX::STV_i32_v4_avar; - case NVPTX::ST_v4i32_areg: return NVPTX::STV_i32_v4_areg; - case NVPTX::ST_v4i32_ari: return NVPTX::STV_i32_v4_ari; - case NVPTX::ST_v4i32_asi: 
return NVPTX::STV_i32_v4_asi;
-
-  case NVPTX::ST_v2f32_avar: return NVPTX::STV_f32_v2_avar;
-  case NVPTX::ST_v2f32_areg: return NVPTX::STV_f32_v2_areg;
-  case NVPTX::ST_v2f32_ari: return NVPTX::STV_f32_v2_ari;
-  case NVPTX::ST_v2f32_asi: return NVPTX::STV_f32_v2_asi;
-  case NVPTX::ST_v4f32_avar: return NVPTX::STV_f32_v4_avar;
-  case NVPTX::ST_v4f32_areg: return NVPTX::STV_f32_v4_areg;
-  case NVPTX::ST_v4f32_ari: return NVPTX::STV_f32_v4_ari;
-  case NVPTX::ST_v4f32_asi: return NVPTX::STV_f32_v4_asi;
-
-  case NVPTX::ST_v2i64_avar: return NVPTX::STV_i64_v2_avar;
-  case NVPTX::ST_v2i64_areg: return NVPTX::STV_i64_v2_areg;
-  case NVPTX::ST_v2i64_ari: return NVPTX::STV_i64_v2_ari;
-  case NVPTX::ST_v2i64_asi: return NVPTX::STV_i64_v2_asi;
-  case NVPTX::ST_v2f64_avar: return NVPTX::STV_f64_v2_avar;
-  case NVPTX::ST_v2f64_areg: return NVPTX::STV_f64_v2_areg;
-  case NVPTX::ST_v2f64_ari: return NVPTX::STV_f64_v2_ari;
-  case NVPTX::ST_v2f64_asi: return NVPTX::STV_f64_v2_asi;
-  }
-  return 0;
-}
diff --git a/lib/Target/NVPTX/gen-register-defs.py b/lib/Target/NVPTX/gen-register-defs.py
deleted file mode 100644
index ed066682312..00000000000
--- a/lib/Target/NVPTX/gen-register-defs.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/env python
-
-num_regs = 396
-
-outFile = open('NVPTXRegisterInfo.td', 'w')
-
-outFile.write('''
-//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Declarations that describe the PTX register file
-//===----------------------------------------------------------------------===//
-
-class NVPTXReg<string n> : Register<n> {
-  let Namespace = "NVPTX";
-}
-
-class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
-  : RegisterClass <"NVPTX", regTypes, alignment, regList>;
-
-//===----------------------------------------------------------------------===//
-// Registers
-//===----------------------------------------------------------------------===//
-
-// Special Registers used as stack pointer
-def VRFrame : NVPTXReg<"%SP">;
-def VRFrameLocal : NVPTXReg<"%SPL">;
-
-// Special Registers used as the stack
-def VRDepot : NVPTXReg<"%Depot">;
-''')
-
-# Predicates
-outFile.write('''
-//===--- Predicate --------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i))
-
-# Int8
-outFile.write('''
-//===--- 8-bit ------------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i))
-
-# Int16
-outFile.write('''
-//===--- 16-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i))
-
-# Int32
-outFile.write('''
-//===--- 32-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i))
-
-# Int64
-outFile.write('''
-//===--- 64-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i))
-
-# F32
-outFile.write('''
-//===--- 32-bit float -----------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i))
-
-# F64
-outFile.write('''
-//===--- 64-bit float -----------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i))
-
-# Vector registers
-outFile.write('''
-//===--- Vector -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i))
-
-for i in range(0, num_regs):
-  outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i))
-
-# Argument registers
-outFile.write('''
-//===--- Arguments --------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def fa%d : NVPTXReg<"%%fa%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i))
-
-outFile.write('''
-//===----------------------------------------------------------------------===//
-// Register classes
-//===----------------------------------------------------------------------===//
-''')
-
-outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('''
-// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
-''')
-
-outFile.write('''
-class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList,
-                       NVPTXRegClass sClass,
-                       int e,
-                       string n>
- : NVPTXRegClass<regTypes, alignment, regList>
-{
-  NVPTXRegClass scalarClass=sClass;
-  int elems=e;
-  string name=n;
-}
-''')
-
-
-outFile.write('def V2F32Regs\n : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1))
-outFile.write('def V4F32Regs\n : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1))
-
-outFile.write('def V2I32Regs\n : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1))
-outFile.write('def V4I32Regs\n : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1))
-
-outFile.write('def V2F64Regs\n : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1))
-outFile.write('def V2I64Regs\n : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1))
-
-outFile.write('def V2I16Regs\n : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1))
-outFile.write('def V4I16Regs\n : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1))
-
-outFile.write('def V2I8Regs\n : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1))
-outFile.write('def V4I8Regs\n : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1))
-
-outFile.close()
-
-
-outFile = open('NVPTXNumRegisters.h', 'w')
-outFile.write('''
-//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef NVPTX_NUM_REGISTERS_H
-#define NVPTX_NUM_REGISTERS_H
-
-namespace llvm {
-
-const unsigned NVPTXNumRegisters = %d;
-
-}
-
-#endif
-''' % num_regs)
-
-outFile.close()
diff --git a/test/CodeGen/NVPTX/vector-loads.ll b/test/CodeGen/NVPTX/vector-loads.ll
new file mode 100644
index 00000000000..f5a1795e3c2
--- /dev/null
+++ b/test/CodeGen/NVPTX/vector-loads.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; Even though general vector types are not supported in PTX, we can still
+; optimize loads/stores with pseudo-vector instructions of the form:
+;
+; ld.v2.f32 {%f0, %f1}, [%r0]
+;
+; which will load two floats at once into scalar registers.
+
+define void @foo(<2 x float>* %a) {
+; CHECK: .func foo
+; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}];
+  %t1 = load <2 x float>* %a
+  %t2 = fmul <2 x float> %t1, %t1
+  store <2 x float> %t2, <2 x float>* %a
+  ret void
+}
+
+define void @foo2(<4 x float>* %a) {
+; CHECK: .func foo2
+; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}];
+  %t1 = load <4 x float>* %a
+  %t2 = fmul <4 x float> %t1, %t1
+  store <4 x float> %t2, <4 x float>* %a
+  ret void
+}
+
+define void @foo3(<8 x float>* %a) {
+; CHECK: .func foo3
+; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}];
+; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}+16];
+  %t1 = load <8 x float>* %a
+  %t2 = fmul <8 x float> %t1, %t1
+  store <8 x float> %t2, <8 x float>* %a
+  ret void
+}
+
+
+define void @foo4(<2 x i32>* %a) {
+; CHECK: .func foo4
+; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}];
+  %t1 = load <2 x i32>* %a
+  %t2 = mul <2 x i32> %t1, %t1
+  store <2 x i32> %t2, <2 x i32>* %a
+  ret void
+}
+
+define void @foo5(<4 x i32>* %a) {
+; CHECK: .func foo5
+; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}];
+  %t1 = load <4 x i32>* %a
+  %t2 = mul <4 x i32> %t1, %t1
+  store <4 x i32> %t2, <4 x i32>* %a
+  ret void
+}
+
+define void @foo6(<8 x i32>* %a) {
+; CHECK: .func foo6
+; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}];
+; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}+16];
+  %t1 = load <8 x i32>* %a
+  %t2 = mul <8 x i32> %t1, %t1
+  store <8 x i32> %t2, <8 x i32>* %a
+  ret void
+}
-- 
2.11.0