From f50f927d65a97737d3e31229f9006c989bbc1fc4 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 22 Aug 2014 18:49:35 +0000 Subject: [PATCH] R600/SI: Use READ2/WRITE2 instructions for 64-bit mem ops with 32-bit alignment git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216279 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 27 +++++++++++++++ lib/Target/R600/AMDGPUInstructions.td | 11 +++++++ lib/Target/R600/SIInstrInfo.td | 1 + lib/Target/R600/SIInstructions.td | 26 +++++++++++++-- test/CodeGen/R600/unaligned-load-store.ll | 55 +++++++++++++++++++++++++++++-- 5 files changed, 116 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 7911b6f3302..b988d33ffbf 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -91,6 +91,8 @@ private: bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; + bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, @@ -782,6 +784,31 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, return true; } +bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + ConstantSDNode *C1 = cast(N1); + unsigned DWordOffset0 = C1->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + // (add n0, c0) + if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + Base = N0; + Offset0 = CurDAG->getTargetConstant(DWordOffset0, MVT::i8); + Offset1 = 
CurDAG->getTargetConstant(DWordOffset1, MVT::i8); + return true; + } + } + + // default case + Base = Addr; + Offset0 = CurDAG->getTargetConstant(0, MVT::i8); + Offset1 = CurDAG->getTargetConstant(1, MVT::i8); + return true; +} + static SDValue wrapAddr64Rsrc(SelectionDAG *DAG, SDLoc DL, SDValue Ptr) { return SDValue(DAG->getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::v4i32, Ptr), 0); diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 0f2b625dc69..cf3bffac968 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -282,6 +282,17 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast(N)); }]>; +class Aligned8Bytes : PatFrag (N)->getAlignment() % 8 == 0; +}]>; + +def local_load_aligned8bytes : Aligned8Bytes < + (ops node:$ptr), (local_load node:$ptr) +>; + +def local_store_aligned8bytes : Aligned8Bytes < + (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +>; class local_binary_atomic_op : PatFrag<(ops node:$ptr, node:$value), diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 5357af97422..064a67efb7a 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -192,6 +192,7 @@ def tfe : Operand { //===----------------------------------------------------------------------===// def DS1Addr1Offset : ComplexPattern; +def DS64Bit4ByteAligned : ComplexPattern; def MUBUFAddr32 : ComplexPattern; def MUBUFAddr64 : ComplexPattern; diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 40fca9f264f..059789223d7 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -2530,7 +2530,18 @@ def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; def : DSReadPat ; -def : DSReadPat ; + +let AddedComplexity = 100 in { + +def : DSReadPat ; + +} // End AddedComplexity = 100 + +def : Pat < + (v2i32 (local_load (DS64Bit4ByteAligned 
i32:$ptr, i8:$offset0, + i8:$offset1))), + (DS_READ2_B32 (i1 0), $ptr, $offset0, $offset1) +>; class DSWritePat : Pat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), @@ -2540,7 +2551,18 @@ class DSWritePat : Pat < def : DSWritePat ; def : DSWritePat ; def : DSWritePat ; -def : DSWritePat ; + +let AddedComplexity = 100 in { + +def : DSWritePat ; +} // End AddedComplexity = 100 + +def : Pat < + (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, + i8:$offset1)), + (DS_WRITE2_B32 (i1 0), $ptr, (EXTRACT_SUBREG $value, sub0), + (EXTRACT_SUBREG $value, sub1), $offset0, $offset1) +>; multiclass DSAtomicRetPat { def : Pat < diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll index 76ea97cd94a..7df7ba00d54 100644 --- a/test/CodeGen/R600/unaligned-load-store.ll +++ b/test/CodeGen/R600/unaligned-load-store.ll @@ -32,9 +32,8 @@ define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> ad ret void } -; FIXME: This should use ds_read2_b32 ; SI-LABEL: @load_lds_i64_align_4 -; SI: DS_READ_B64 +; SI: DS_READ2_B32 ; SI: S_ENDPGM define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64 addrspace(3)* %in, align 4 @@ -42,9 +41,61 @@ define void @load_lds_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspac ret void } +; SI-LABEL: @load_lds_i64_align_4_with_offset +; SI: DS_READ2_B32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}}, 0x8, 0x9 +; SI: S_ENDPGM +define void @load_lds_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { + %ptr = getelementptr i64 addrspace(3)* %in, i32 4 + %val = load i64 addrspace(3)* %ptr, align 4 + store i64 %val, i64 addrspace(1)* %out, align 8 + ret void +} + +; SI-LABEL: @load_lds_i64_align_4_with_split_offset +; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits +; SI: DS_READ2_B32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}}, 0x0, 0x1 +; SI: S_ENDPGM 
+define void @load_lds_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  %val = load i64 addrspace(3)* %ptri64, align 4
+  store i64 %val, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
 ; FIXME: Need to fix this case.
 ; define void @load_lds_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
 ;   %val = load i64 addrspace(3)* %in, align 1
 ;   store i64 %val, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
+
+; SI-LABEL: @store_lds_i64_align_4
+; SI: DS_WRITE2_B32
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+  store i64 %val, i64 addrspace(3)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @store_lds_i64_align_4_with_offset
+; SI: DS_WRITE2_B32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x8, 0x9
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = getelementptr i64 addrspace(3)* %out, i32 4
+  store i64 0, i64 addrspace(3)* %ptr, align 4
+  ret void
+}
+
+; SI-LABEL: @store_lds_i64_align_4_with_split_offset
+; Tests the case where the lo dword offset (255) fits in 8 bits but the hi dword offset (256) needs 9 bits, so the offset is not folded.
+; SI: DS_WRITE2_B32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, 0x0, 0x1
+; SI: S_ENDPGM
+define void @store_lds_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
+  %ptr255 = getelementptr i32 addrspace(3)* %ptr, i32 255
+  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
+  store i64 0, i64 addrspace(3)* %ptri64, align 4
+  ret void
+}
-- 
2.11.0