R600: Fix min/max matching problems with unordered compares

author Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp

index 0e34b46..5783d43 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -1038,17 +1038,21 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
  }
  
  /// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
-                                             EVT VT,
-                                             SDValue LHS,
-                                             SDValue RHS,
-                                             SDValue True,
-                                             SDValue False,
-                                             SDValue CC,
-                                             SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
+                                                   EVT VT,
+                                                   SDValue LHS,
+                                                   SDValue RHS,
+                                                   SDValue True,
+                                                   SDValue False,
+                                                   SDValue CC,
+                                                   DAGCombinerInfo &DCI) const {
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return SDValue();
+
    if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
      return SDValue();
  
+  SelectionDAG &DAG = DCI.DAG;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    switch (CCOpcode) {
    case ISD::SETOEQ:
@@ -1065,33 +1069,51 @@ SDValue AMDGPUTargetLowering::CombineFMinMax(SDLoc DL,
    case ISD::SETO:
      break;
    case ISD::SETULE:
-  case ISD::SETULT:
+  case ISD::SETULT: {
+    // Unordered.
+    //
+    // We will allow this before legalization since we expand unordered compares
+    // ordinarily.
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+  }
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT: {
-    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      break;
+    // Ordered. Assume ordered for undefined.
+
+    // Only do this after legalization to avoid interfering with other combines
+    // which might occur.
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
  
      // We need to permute the operands to get the correct NaN behavior. The
      // selected operand is the second one based on the failing compare with NaN,
      // so permute it based on the compare type the hardware uses.
      if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+  }
+  case ISD::SETUGE:
+  case ISD::SETUGT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    }
    case ISD::SETGT:
    case ISD::SETGE:
-  case ISD::SETUGE:
    case ISD::SETOGE:
-  case ISD::SETUGT:
    case ISD::SETOGT: {
-    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      break;
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
+        !DCI.isCalledByLegalizer())
+      return SDValue();
  
      if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
-    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
+      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
+    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    }
    case ISD::SETCC_INVALID:
      llvm_unreachable("Invalid setcc condcode!");
@@ -2276,24 +2298,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
        simplifyI24(N1, DCI);
        return SDValue();
      }
-  case ISD::SELECT_CC: {
-    SDLoc DL(N);
-    EVT VT = N->getValueType(0);
-
-    if (VT == MVT::f32 ||
-        (VT == MVT::f64 &&
-         Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
-      SDValue LHS = N->getOperand(0);
-      SDValue RHS = N->getOperand(1);
-      SDValue True = N->getOperand(2);
-      SDValue False = N->getOperand(3);
-      SDValue CC = N->getOperand(4);
-
-      return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
-    }
-
-    break;
-  }
    case ISD::SELECT: {
      SDValue Cond = N->getOperand(0);
      if (Cond.getOpcode() == ISD::SETCC) {
@@ -2306,11 +2310,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
        SDValue True = N->getOperand(1);
        SDValue False = N->getOperand(2);
  
-      if (VT == MVT::f32 ||
-          (VT == MVT::f64 &&
-           Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)) {
-        return CombineFMinMax(DL, VT, LHS, RHS, True, False, CC, DAG);
-      }
+      if (VT == MVT::f32)
+        return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
  
        // TODO: Implement min / max Evergreen instructions.
        if (VT == MVT::i32 &&
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h

index 7386eae..64ec024 100644 (file)
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -145,14 +145,14 @@ public:
  
    SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue CombineFMinMax(SDLoc DL,
-                         EVT VT,
-                         SDValue LHS,
-                         SDValue RHS,
-                         SDValue True,
-                         SDValue False,
-                         SDValue CC,
-                         SelectionDAG &DAG) const;
+  SDValue CombineFMinMaxLegacy(SDLoc DL,
+                               EVT VT,
+                               SDValue LHS,
+                               SDValue RHS,
+                               SDValue True,
+                               SDValue False,
+                               SDValue CC,
+                               DAGCombinerInfo &DCI) const;
    SDValue CombineIMinMax(SDLoc DL,
                           EVT VT,
                           SDValue LHS,
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp

index 1a84546..fb7514e 100644 (file)
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -1118,6 +1118,13 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
    SDValue CC = Op.getOperand(4);
    SDValue Temp;
  
+  if (VT == MVT::f32) {
+    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+    if (MinMax)
+      return MinMax;
+  }
+
    // LHS and RHS are guaranteed to be the same value type
    EVT CompareVT = LHS.getValueType();
  
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td

index aeba351..8c3f140 100644 (file)
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1514,6 +1514,7 @@ let isCommutable = 1 in {
  defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
    VOP_F32_F32_F32
  >;
+} // End isCommutable = 1
  
  defm V_MIN_LEGACY_F32 : VOP2Inst <vop2<0xd>, "v_min_legacy_f32",
    VOP_F32_F32_F32, AMDGPUfmin_legacy
@@ -1522,6 +1523,7 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <vop2<0xe>, "v_max_legacy_f32",
    VOP_F32_F32_F32, AMDGPUfmax_legacy
  >;
  
+let isCommutable = 1 in {
  defm V_LSHR_B32 : VOP2Inst <vop2<0x15>, "v_lshr_b32", VOP_I32_I32_I32, srl>;
  defm V_ASHR_I32 : VOP2Inst <vop2<0x17>, "v_ashr_i32",
    VOP_I32_I32_I32, sra
diff --git a/test/CodeGen/R600/fmax_legacy.f64.ll b/test/CodeGen/R600/fmax_legacy.f64.ll

new file mode 100644 (file)

index 0000000..a615825
--- /dev/null
+++ b/test/CodeGen/R600/fmax_legacy.f64.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; Make sure we don't try to form FMAX_LEGACY nodes with f64
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmax_legacy_uge_f64
+define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp uge double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_oge_f64
+define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp oge double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ugt_f64
+define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ugt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmax_legacy_ogt_f64
+define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ogt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmax_legacy.ll b/test/CodeGen/R600/fmax_legacy.ll

index 8415f34..43e9de1 100644 (file)
--- a/test/CodeGen/R600/fmax_legacy.ll
+++ b/test/CodeGen/R600/fmax_legacy.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
  ; FUNC-LABEL: @test_fmax_legacy_uge_f32
  ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
  ; EG: MAX
  define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -44,7 +44,7 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
  ; FUNC-LABEL: @test_fmax_legacy_ugt_f32
  ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
  ; EG: MAX
  define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.r600.read.tidig.x() #1
diff --git a/test/CodeGen/R600/fmin_legacy.f64.ll b/test/CodeGen/R600/fmin_legacy.f64.ll

new file mode 100644 (file)

index 0000000..51dcd06
--- /dev/null
+++ b/test/CodeGen/R600/fmin_legacy.f64.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; FUNC-LABEL: @test_fmin_legacy_f64
+define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+   %r0 = extractelement <4 x double> %reg0, i32 0
+   %r1 = extractelement <4 x double> %reg0, i32 1
+   %r2 = fcmp uge double %r0, %r1
+   %r3 = select i1 %r2, double %r1, double %r0
+   %vec = insertelement <4 x double> undef, double %r3, i32 0
+   store <4 x double> %vec, <4 x double> addrspace(1)* %out, align 16
+   ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ule_f64
+define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ule double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ole_f64
+define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ole double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_olt_f64
+define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp olt double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: @test_fmin_legacy_ult_f64
+define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x() #1
+  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
+  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1
+
+  %a = load double addrspace(1)* %gep.0, align 8
+  %b = load double addrspace(1)* %gep.1, align 8
+
+  %cmp = fcmp ult double %a, %b
+  %val = select i1 %cmp, double %a, double %b
+  store double %val, double addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/R600/fmin_legacy.ll b/test/CodeGen/R600/fmin_legacy.ll

index 3f60b60..229d008 100644 (file)
--- a/test/CodeGen/R600/fmin_legacy.ll
+++ b/test/CodeGen/R600/fmin_legacy.ll
@@ -19,7 +19,7 @@ define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> in
  ; FUNC-LABEL: @test_fmin_legacy_ule_f32
  ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
  define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.r600.read.tidig.x() #1
    %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
@@ -73,7 +73,7 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
  ; FUNC-LABEL: @test_fmin_legacy_ult_f32
  ; SI: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
  ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
  define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
    %tid = call i32 @llvm.r600.read.tidig.x() #1
    %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 12 Dec 2014 02:30:37 +0000 (02:30 +0000)
lib/Target/R600/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/R600/AMDGPUISelLowering.h		patch \| blob \| history
lib/Target/R600/R600ISelLowering.cpp		patch \| blob \| history
lib/Target/R600/SIInstructions.td		patch \| blob \| history
test/CodeGen/R600/fmax_legacy.f64.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/R600/fmax_legacy.ll		patch \| blob \| history
test/CodeGen/R600/fmin_legacy.f64.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/R600/fmin_legacy.ll		patch \| blob \| history