From a1535e3b9b365c028c0ab56c4505b8afc8d6a86a Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 16 Jul 2014 19:45:35 +0000
Subject: [PATCH] [NVPTX] Honor alignment on vector loads/stores

We were not considering the stated alignment on vector loads/stores,
leading us to generate vector instructions even when we do not have
sufficient alignment.

Now, for IR like:

  %1 = load <4 x float>, <4 x float>* %ptr, align 4

we will generate correct, conservative PTX like:

  ld.f32 ... [%ptr]
  ld.f32 ... [%ptr+4]
  ld.f32 ... [%ptr+8]
  ld.f32 ... [%ptr+12]

Or if we have an alignment of 8 (for example), we can generate code like:

  ld.v2.f32 ... [%ptr]
  ld.v2.f32 ... [%ptr+8]

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213186 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/NVPTX/NVPTXISelLowering.cpp       | 36 +++++++++++--
 test/CodeGen/NVPTX/misaligned-vector-ldst.ll | 77 ++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 test/CodeGen/NVPTX/misaligned-vector-ldst.ll

diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index cb452ff0725..91e24bb617d 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1494,6 +1494,21 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     break;
   }
 
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  const DataLayout *TD = getDataLayout();
+
+  unsigned Align = MemSD->getAlignment();
+  unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This store is not sufficiently aligned, so bail out and let this vector
+    // store be scalarized.  Note that we may still be able to emit smaller
+    // vector stores.  For example, if we are storing a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return SDValue();
+  }
+
   unsigned Opcode = 0;
   EVT EltVT = ValVT.getVectorElementType();
   unsigned NumElts = ValVT.getVectorNumElements();
@@ -1536,8 +1551,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     Ops.push_back(N->getOperand(i));
   }
 
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-
   SDValue NewSt = DAG.getMemIntrinsicNode(
       Opcode, DL, DAG.getVTList(MVT::Other), Ops, MemSD->getMemoryVT(),
       MemSD->getMemOperand());
@@ -3046,6 +3059,7 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
 
 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              const DataLayout *TD,
                               SmallVectorImpl<SDValue> &Results) {
   EVT ResVT = N->getValueType(0);
   SDLoc DL(N);
@@ -3073,6 +3087,20 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     break;
   }
 
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  unsigned Align = LD->getAlignment();
+  unsigned PrefAlign =
+      TD->getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+  if (Align < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return;
+  }
+
   EVT EltVT = ResVT.getVectorElementType();
   unsigned NumElts = ResVT.getVectorNumElements();
@@ -3109,8 +3137,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
     OtherOps.push_back(N->getOperand(i));
 
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
   // The select routine does not have access to the LoadSDNode instance, so
   // pass along the extension information
   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
@@ -3283,7 +3309,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   default:
     report_fatal_error("Unhandled custom legalization");
   case ISD::LOAD:
-    ReplaceLoadVector(N, DAG, Results);
+    ReplaceLoadVector(N, DAG, getDataLayout(), Results);
     return;
   case ISD::INTRINSIC_W_CHAIN:
     ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
new file mode 100644
index 00000000000..90c9c4306de
--- /dev/null
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: t1
+define <4 x float> @t1(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK-NOT: ld.f32
+; CHECK: ld.u8
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 1
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t2
+define <4 x float> @t2(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK-NOT: ld.v2
+; CHECK: ld.f32
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 4
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t3
+define <4 x float> @t3(i8* %p1) {
+; CHECK-NOT: ld.v4
+; CHECK: ld.v2
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 8
+  ret <4 x float> %r
+}
+
+; CHECK-LABEL: t4
+define <4 x float> @t4(i8* %p1) {
+; CHECK: ld.v4
+  %cast = bitcast i8* %p1 to <4 x float>*
+  %r = load <4 x float>* %cast, align 16
+  ret <4 x float> %r
+}
+
+
+; CHECK-LABEL: s1
+define void @s1(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK-NOT: st.f32
+; CHECK: st.u8
+  store <4 x float> %v, <4 x float>* %p1, align 1
+  ret void
+}
+
+; CHECK-LABEL: s2
+define void @s2(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+; CHECK-NOT: st.v2
+; CHECK: st.f32
+  store <4 x float> %v, <4 x float>* %p1, align 4
+  ret void
+}
+
+; CHECK-LABEL: s3
+define void @s3(<4 x float>* %p1, <4 x float> %v) {
+; CHECK-NOT: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 8
+  ret void
+}
+
+; CHECK-LABEL: s4
+define void @s4(<4 x float>* %p1, <4 x float> %v) {
+; CHECK: st.v4
+  store <4 x float> %v, <4 x float>* %p1, align 16
+  ret void
+}
+
-- 
2.11.0