drm/amdgpu: break GPU recovery once it's in bad state(v4)

author Guchun Chen <guchun.chen@amd.com>

Thu, 23 Jul 2020 08:20:02 +0000 (16:20 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 4 Aug 2020 21:26:54 +0000 (17:26 -0400)
author Guchun Chen <guchun.chen@amd.com>
Thu, 23 Jul 2020 08:20:02 +0000 (16:20 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 4 Aug 2020 21:26:54 +0000 (17:26 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 905c5ab..d01d796 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
  
                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
  
-                               /* must succeed. */
-                               amdgpu_ras_resume(tmp_adev);
+                               /*
+                                * The GPU enters bad state once the number of
+                                * faulty pages found by ECC reaches the
+                                * threshold, and ras recovery is scheduled
+                                * next. So add one check here to break
+                                * recovery if it indeed exceeds the bad page
+                                * threshold, and remind the user to retire
+                                * this GPU or set a bigger bad_page_threshold
+                                * value when probing the driver again.
+                                */
+                               if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+                                       /* must succeed. */
+                                       amdgpu_ras_resume(tmp_adev);
+                               } else {
+                                       r = -EINVAL;
+                                       goto out;
+                               }
  
                                 /* Update PSP FW topology after reset */
                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                         }
                 }
  
-
  out:
                 if (!r) {
                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index d081de2..ab65dfd 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2205,3 +2205,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
  
         return false;
  }
+
+/* Report whether the RAS EEPROM table header marks this GPU as bad. */
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       bool gpu_is_bad = false;
+
+       /* No RAS context, or a zero bad_page_threshold: nothing to check. */
+       if (!ras || amdgpu_bad_page_threshold == 0)
+               return false;
+
+       /* Only the flag matters here; the status code is not consulted. */
+       amdgpu_ras_eeprom_check_err_threshold(&ras->eeprom_control,
+                                               &gpu_is_bad);
+       return gpu_is_bad;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index cf9f602..70a6fca 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
  unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
                 bool is_ce);
  
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
  /* error handling functions */
  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                 struct eeprom_table_record *bps, int pages);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

index 461dfd2..7848a42 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
         return curr_address;
  }
  
+/* Set *exceed_err_limit if the EEPROM header is BAD; returns 0 or -errno. */
+int amdgpu_ras_eeprom_check_err_threshold(
+                               struct amdgpu_ras_eeprom_control *control,
+                               bool *exceed_err_limit)
+{
+       struct amdgpu_device *adev = to_amdgpu_device(control);
+       unsigned char buff[EEPROM_ADDRESS_SIZE +
+                       EEPROM_TABLE_HEADER_SIZE] = { 0 };
+       struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+       struct i2c_msg msg = {
+                       .addr = control->i2c_address,
+                       .flags = I2C_M_RD,
+                       .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
+                       .buf = buff,
+       };
+       int ret;
+
+       *exceed_err_limit = false;
+       /* read EEPROM table header */
+       mutex_lock(&control->tbl_mutex);
+       ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
+       if (ret < 1) {
+               dev_err(adev->dev, "Failed to read EEPROM table header.\n");
+               /* A short transfer carries no errno of its own; use -EIO. */
+               ret = ret < 0 ? ret : -EIO;
+               goto err;
+       }
+       ret = 0;
+       __decode_table_header_from_buff(hdr, &buff[2]);
+       if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+               dev_warn(adev->dev, "This GPU is in BAD status.\n");
+               dev_warn(adev->dev, "Please retire it or setting one bigger "
+                               "threshold value when reloading driver.\n");
+               *exceed_err_limit = true;
+       }
+err:
+       mutex_unlock(&control->tbl_mutex);
+       return ret;
+}
+
  int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                                             struct eeprom_table_record *records,
                                             bool write,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 9839b4e..c7a5e5c 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -80,6 +80,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
                         bool *exceed_err_limit);
  int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
  
+int amdgpu_ras_eeprom_check_err_threshold(
+                               struct amdgpu_ras_eeprom_control *control,
+                               bool *exceed_err_limit);
+
  int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                                             struct eeprom_table_record *records,
                                             bool write,
author	Guchun Chen <guchun.chen@amd.com>
	Thu, 23 Jul 2020 08:20:02 +0000 (16:20 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 4 Aug 2020 21:26:54 +0000 (17:26 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h		patch \| blob \| history