R600/SI: Allow commuting some 3 op instructions

author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 13 Nov 2014 19:26:47 +0000 (19:26 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 13 Nov 2014 19:26:47 +0000 (19:26 +0000)
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td

index 3842f84..33d2d07 100644 (file)
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1374,12 +1374,12 @@ defm V_SUBREV_F32 : VOP2Inst <vop2<0x5>, "v_subrev_f32",
  >;
  } // End isCommutable = 1
  
+let isCommutable = 1 in {
+
  defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
    VOP_F32_F32_F32
  >;
  
-let isCommutable = 1 in {
-
  defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
    VOP_F32_F32_F32, int_AMDGPU_mul
  >;
@@ -1388,7 +1388,6 @@ defm V_MUL_F32 : VOP2Inst <vop2<0x8>, "v_mul_f32",
    VOP_F32_F32_F32, fmul
  >;
  
-
  defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
    VOP_I32_I32_I32, AMDGPUmul_i24
  >;
@@ -1449,11 +1448,21 @@ defm V_XOR_B32 : VOP2Inst <vop2<0x1d>, "v_xor_b32",
  
  defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
    VOP_I32_I32_I32, AMDGPUbfm>;
+
+let isCommutable = 1 in {
  defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
  defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
+
+let isCommutable = 1 in {
  defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+
  defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
  defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+
    VOP_I32_I32_I32
  >;
  defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
@@ -1503,18 +1512,22 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <vop2<0x2f>, "v_cvt_pkrtz_f16_f32",
  // VOP3 Instructions
  //===----------------------------------------------------------------------===//
  
+let isCommutable = 1 in {
  defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
    VOP_F32_F32_F32_F32
  >;
+
  defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
    VOP_F32_F32_F32_F32, fmad
  >;
+
  defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
    VOP_I32_I32_I32_I32, AMDGPUmad_i24
  >;
  defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
    VOP_I32_I32_I32_I32, AMDGPUmad_u24
  >;
+} // End isCommutable = 1
  
  defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
    VOP_F32_F32_F32_F32
@@ -1537,12 +1550,16 @@ defm V_BFE_I32 : VOP3Inst <vop3<0x149>, "v_bfe_i32",
  defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
    VOP_I32_I32_I32_I32, AMDGPUbfi
  >;
+
+let isCommutable = 1 in {
  defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
    VOP_F32_F32_F32_F32, fma
  >;
  defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
    VOP_F64_F64_F64_F64, fma
  >;
+} // End isCommutable = 1
+
  //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
  defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
    VOP_I32_I32_I32_I32
@@ -1629,15 +1646,19 @@ defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d>, "v_div_scale_f32", []>;
  // Double precision division pre-scale.
  defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
  
+let isCommutable = 1 in {
  defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
    VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
  >;
  defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
    VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
  >;
+} // End isCommutable = 1
+
  //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
  //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
  //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+
  defm V_TRIG_PREOP_F64 : VOP3Inst <
    vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
  >;
@@ -2840,6 +2861,8 @@ defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8",
  defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
    VOP_I32_I32_I32
  >;
+
+let isCommutable = 1 in {
  defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
    VOP_I64_I32_I32_I64
  >;
@@ -2848,6 +2871,7 @@ defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
  defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
    VOP_I64_I32_I32_I64
  >;
+} // End isCommutable = 1
  
  // Remaining instructions:
  // FLAT_*
diff --git a/test/CodeGen/R600/commute_modifiers.ll b/test/CodeGen/R600/commute_modifiers.ll

index 2504688..f2885a7 100644 (file)
--- a/test/CodeGen/R600/commute_modifiers.ll
+++ b/test/CodeGen/R600/commute_modifiers.ll
@@ -46,5 +46,111 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
    ret void
  }
  
+; FIXME: Should use SGPR for literal.
+; FUNC-LABEL: @commute_add_lit_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %x = load float addrspace(1)* %gep.0
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %z = fadd float 1024.0, %x.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_add_fabs_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fadd float %x, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fneg = fsub float -0.000000e+00, %y
+  %z = fmul float %x, %y.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; There's no reason to commute this.
+; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %z = fmul float %x.fabs, %y.fabs
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+  %x = load float addrspace(1)* %gep.0
+  %y = load float addrspace(1)* %gep.1
+  %x.fabs = call float @llvm.fabs.f32(float %x) #1
+  %y.fabs = call float @llvm.fabs.f32(float %y) #1
+  %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+  %z = fmul float %x.fabs, %y.fabs.fneg
+  store float %z, float addrspace(1)* %out
+  ret void
+}
+
  attributes #0 = { nounwind }
  attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll

index d715c99..637e799 100644 (file)
--- a/test/CodeGen/R600/fma.ll
+++ b/test/CodeGen/R600/fma.ll
@@ -5,6 +5,8 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
  declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
  declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
  
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
  ; FUNC-LABEL: {{^}}fma_f32:
  ; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
  
@@ -12,12 +14,12 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounw
  ; EG: FMA {{\*? *}}[[RES]]
  define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                       float addrspace(1)* %in2, float addrspace(1)* %in3) {
-   %r0 = load float addrspace(1)* %in1
-   %r1 = load float addrspace(1)* %in2
-   %r2 = load float addrspace(1)* %in3
-   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
-   store float %r3, float addrspace(1)* %out
-   ret void
+  %r0 = load float addrspace(1)* %in1
+  %r1 = load float addrspace(1)* %in2
+  %r2 = load float addrspace(1)* %in3
+  %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+  store float %r3, float addrspace(1)* %out
+  ret void
  }
  
  ; FUNC-LABEL: {{^}}fma_v2f32:
@@ -29,12 +31,12 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
  ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
  define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                         <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
-   %r0 = load <2 x float> addrspace(1)* %in1
-   %r1 = load <2 x float> addrspace(1)* %in2
-   %r2 = load <2 x float> addrspace(1)* %in3
-   %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
-   store <2 x float> %r3, <2 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <2 x float> addrspace(1)* %in1
+  %r1 = load <2 x float> addrspace(1)* %in2
+  %r2 = load <2 x float> addrspace(1)* %in3
+  %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+  store <2 x float> %r3, <2 x float> addrspace(1)* %out
+  ret void
  }
  
  ; FUNC-LABEL: {{^}}fma_v4f32:
@@ -50,10 +52,41 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
  ; EG-DAG: FMA {{\*? *}}[[RES]].W
  define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                         <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
-   %r0 = load <4 x float> addrspace(1)* %in1
-   %r1 = load <4 x float> addrspace(1)* %in2
-   %r2 = load <4 x float> addrspace(1)* %in3
-   %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
-   store <4 x float> %r3, <4 x float> addrspace(1)* %out
-   ret void
+  %r0 = load <4 x float> addrspace(1)* %in1
+  %r1 = load <4 x float> addrspace(1)* %in2
+  %r2 = load <4 x float> addrspace(1)* %in3
+  %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+  store <4 x float> %r3, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
+; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %b = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_s_f32
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+  %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+  %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+  %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+  %a = load float addrspace(1)* %in.a.gep, align 4
+  %c = load float addrspace(1)* %in.b.gep, align 4
+
+  %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+  store float %fma, float addrspace(1)* %out.gep, align 4
+  ret void
  }
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll

index becc3e4..16003a5 100644 (file)
--- a/test/CodeGen/R600/fmuladd.ll
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -116,7 +116,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
  ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
  ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
  ; CHECK: buffer_store_dword [[RESULT]]
  define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -158,7 +158,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
  ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
  ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
  ; CHECK: buffer_store_dword [[RESULT]]
  define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll

index 4de1f75..59d6248 100644 (file)
--- a/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umad24.ll
@@ -5,6 +5,7 @@
  ; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
  
  declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
  
  ; FUNC-LABEL: {{^}}test_umad24:
  ; SI: v_mad_u32_u24
@@ -17,3 +18,21 @@ define void @test_umad24(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2
    ret void
  }
  
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+  %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+  %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+
+  %src0 = load i32 addrspace(1)* %src0.gep, align 4
+  %src2 = load i32 addrspace(1)* %src2.gep, align 4
+  %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+  store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll

index 2e67fbf..aa94a0e 100644 (file)
--- a/test/CodeGen/R600/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -73,7 +73,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
  
  ; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
  ; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
  ; SI: buffer_store_dword [[RESULT]]
  define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
    %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 13 Nov 2014 19:26:47 +0000 (19:26 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 13 Nov 2014 19:26:47 +0000 (19:26 +0000)
lib/Target/R600/SIInstructions.td
test/CodeGen/R600/commute_modifiers.ll
test/CodeGen/R600/fma.ll
test/CodeGen/R600/fmuladd.ll
test/CodeGen/R600/llvm.AMDGPU.umad24.ll
test/CodeGen/R600/use-sgpr-multiple-times.ll