AMDGPU: Fix selection error on constant loads with < 4 byte alignment

author Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index b034f3d..1de2ab4 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3464,10 +3464,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
    return false;
  }
  
-static bool isDwordAligned(unsigned Alignment) {
-  return Alignment % 4 == 0;
-}
-
  //===----------------------------------------------------------------------===//
  // Custom DAG Lowering Operations
  //===----------------------------------------------------------------------===//
@@ -5385,21 +5381,23 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
           AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
  
    unsigned NumElements = MemVT.getVectorNumElements();
+
    if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
        AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
-    if (!Op->isDivergent())
+    if (!Op->isDivergent() && Alignment >= 4)
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
      // have the same legalization requirements as global and private
      // loads.
      //
    }
+
    if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
        AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
        AS == AMDGPUASI.GLOBAL_ADDRESS) {
      if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
          !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
-        isDwordAligned(Alignment))
+        Alignment >= 4)
        return SDValue();
      // Non-uniform loads will be selected to MUBUF instructions, so they
      // have the same legalization requirements as global and private
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll

index 68ff90e..31bb206 100644 (file)
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -72,6 +72,18 @@ entry:
    ret void
  }
  
+; FUNC-LABEL: {{^}}constant_load_v16i16_align2:
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define amdgpu_kernel void @constant_load_v16i16_align2(<16 x i16> addrspace(4)* %ptr0) #0 {
+entry:
+  %ld =  load <16 x i16>, <16 x i16> addrspace(4)* %ptr0, align 2
+  store <16 x i16> %ld, <16 x i16> addrspace(1)* undef, align 32
+  ret void
+}
+
  ; FUNC-LABEL: {{^}}constant_zextload_i16_to_i32:
  ; GCN-NOHSA: buffer_load_ushort
  ; GCN-NOHSA: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll

index 6d24334..c1dcc1d 100644 (file)
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -83,6 +83,18 @@ entry:
    ret void
  }
  
+; GCN-LABEL: {{^}}global_load_v16i16_align2:
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_store_dwordx4
+; GCN-HSA: flat_store_dwordx4
+define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %in, <16 x i16> addrspace(1)* %out) #0 {
+entry:
+  %ld =  load <16 x i16>, <16 x i16> addrspace(1)* %in, align 2
+  store <16 x i16> %ld, <16 x i16> addrspace(1)* %out, align 32
+  ret void
+}
+
  ; FUNC-LABEL: {{^}}global_zextload_i16_to_i32:
  ; GCN-NOHSA: buffer_load_ushort
  ; GCN-NOHSA: buffer_store_dword
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 29 Mar 2018 19:59:28 +0000 (19:59 +0000)
lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
test/CodeGen/AMDGPU/load-constant-i16.ll		patch | blob | history
test/CodeGen/AMDGPU/load-global-i16.ll		patch | blob | history