AMDGPU: Use better alignment for kernarg lowering

author Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 4519b98..cc326cd 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1068,15 +1068,12 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
  SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT,
    const SDLoc &SL, SDValue Chain,
-  uint64_t Offset, bool Signed,
+  uint64_t Offset, unsigned Align, bool Signed,
    const ISD::InputArg *Arg) const {
-  const DataLayout &DL = DAG.getDataLayout();
    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
    SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                               MachineMemOperand::MODereferenceable |
@@ -1663,7 +1660,15 @@ SDValue SITargetLowering::LowerFormalArguments(
  
    SmallVector<SDValue, 16> Chains;
  
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  // FIXME: This is the minimum kernel argument alignment. We should improve
+  // this to the maximum alignment of the arguments.
+  //
+  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+  // kern arg offset.
+  const unsigned KernelArgBaseAlign = 16;
+  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
+
+   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
      const ISD::InputArg &Arg = Ins[i];
      if (Skipped[i]) {
        InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -1677,14 +1682,14 @@ SDValue SITargetLowering::LowerFormalArguments(
        VT = Ins[i].VT;
        EVT MemVT = VA.getLocVT();
  
-      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(Fn) +
-        VA.getLocMemOffset();
+      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
        Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
  
        // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
+      // thread group and global sizes for clover.
        SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
        Chains.push_back(Arg.getValue(1));
  
        auto *ParamTy =
@@ -4303,7 +4308,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                   unsigned Offset) const {
    SDLoc SL(Op);
    SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
-                                           DAG.getEntryNode(), Offset, false);
+                                           DAG.getEntryNode(), Offset, 4, false);
    // The local size values will have the hi 16-bits as zero.
    return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                       DAG.getValueType(VT));
@@ -4404,37 +4409,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_X, false);
+                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
    case Intrinsic::r600_read_ngroups_y:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Y, false);
+                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
    case Intrinsic::r600_read_ngroups_z:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Z, false);
+                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
    case Intrinsic::r600_read_global_size_x:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
    case Intrinsic::r600_read_global_size_y:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
    case Intrinsic::r600_read_global_size_z:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
  
      return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
    case Intrinsic::r600_read_local_size_x:
      if (Subtarget->isAmdHsaOS())
        return emitNonHSAIntrinsicError(DAG, DL, VT);
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h

index 1578576..3fdd158 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -27,7 +27,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
    SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
    SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                     const SDLoc &SL, SDValue Chain,
-                                   uint64_t Offset, bool Signed,
+                                   uint64_t Offset, unsigned Align, bool Signed,
                                     const ISD::InputArg *Arg = nullptr) const;
  
    SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

index 1ece288..d7141ef 100644 (file)
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -88,11 +88,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(<2 x half> addrspace(1)*
  ; Combine turns this into integer op when bitcast source (from load)
  
  ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
  
  ; FIXME: Random commute
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
  ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
  ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
  define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@@ -103,16 +101,12 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
  }
  
  ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
  
  ; FIXME: Random commute
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
  
  ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
  ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll

index a042700..f31b2ab 100644 (file)
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -13,17 +13,10 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
    ret void
  }
  
-; FIXME: Should always be the same
  ; GCN-LABEL: {{^}}load_v2f16_arg:
-; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]],  [[V0]], [[HI]]
-; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-
-; VI: s_load_dword [[ARG:s[0-9]+]]
-; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
-; VI: buffer_store_dword [[V_ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_dword [[V_ARG]]
  define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
    store <2 x half> %arg, <2 x half> addrspace(1)* %out
    ret void
@@ -31,8 +24,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
  
  ; GCN-LABEL: {{^}}load_v3f16_arg:
  ; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+
  ; GCN-NOT: buffer_load
  ; GCN-DAG: buffer_store_dword
  ; GCN-DAG: buffer_store_short
@@ -43,19 +36,14 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
    ret void
  }
  
-; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_dwordx2
  
  ; FIXME: Why not one load?
-; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
-; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
  define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
    store <4 x half> %arg, <4 x half> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll

index f51366f..cb97d71 100644 (file)
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -162,10 +162,11 @@ entry:
  
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; HSA: flat_load_ushort
  define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
  entry:
    store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -179,10 +180,9 @@ entry:
  ; EG: VTX_READ_16
  ; EG: VTX_READ_16
  
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-
-; VI: s_load_dword s
+; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
+; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
  define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
  entry:
    store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -226,11 +226,14 @@ entry:
  ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
  ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
  ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; MESA-VI: buffer_load_ushort
+; MESA-VI: buffer_load_ubyte
+
+; HSA-VI: flat_load_ushort
  ; HSA-VI: flat_load_ubyte
  define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
  entry:
@@ -245,12 +248,9 @@ entry:
  ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
  ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
  ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
+
+; GCN-DAG: s_load_dword s
+; GCN-DAG: {{buffer|flat}}_load_ushort
  define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
  entry:
    store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -293,14 +293,13 @@ entry:
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dword s
  define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
  entry:
    store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -315,13 +314,14 @@ entry:
  ; EG: VTX_READ_16
  ; EG: VTX_READ_16
  
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
  
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
+
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
  define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
  entry:
    store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -372,21 +372,17 @@ entry:
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
  define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
  entry:
    store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -405,15 +401,11 @@ entry:
  ; EG: VTX_READ_16
  ; EG: VTX_READ_16
  
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
  
+; VI: s_load_dwordx2
  ; VI: s_load_dword s
  ; VI: s_load_dword s
  ; VI: s_load_dword s
@@ -481,38 +473,27 @@ entry:
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
  ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
  define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
  entry:
    store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -539,22 +520,13 @@ entry:
  ; EG: VTX_READ_16
  ; EG: VTX_READ_16
  
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
  
  ; VI: s_load_dword s
  ; VI: s_load_dword s
diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll

index 601aca4..f2f845b 100644 (file)
--- a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -39,10 +39,8 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
  }
  
  ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
  ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
  define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
    %x.bc = bitcast <4 x i16> %x to <2 x i32>
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Wed, 30 May 2018 16:17:51 +0000 (16:17 +0000)
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.h		patch \| blob \| history
test/CodeGen/AMDGPU/fneg-fabs.f16.ll		patch \| blob \| history
test/CodeGen/AMDGPU/half.ll		patch \| blob \| history
test/CodeGen/AMDGPU/kernel-args.ll		patch \| blob \| history
test/CodeGen/AMDGPU/reduce-store-width-alignment.ll		patch \| blob \| history