drm/amdgpu: validate bad page threshold in ras(v3)

[uclinux-h8/linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_ras.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 5680f7e..6660094 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
  /* inject address is 52 bits */
  #define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
  
+/* typical ECC bad page rate (1 bad page per 100 MB of VRAM) */
+#define RAS_BAD_PAGE_RATE              (100 * 1024 * 1024ULL)
+
  enum amdgpu_ras_retire_page_reservation {
         AMDGPU_RAS_RETIRE_PAGE_RESERVED,
         AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1699,6 +1702,47 @@ out:
         return ret;
  }
  
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+                                       uint32_t max_length)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int tmp_threshold = amdgpu_bad_page_threshold;
+       u64 val;
+
+       /*
+        * Justification of value bad_page_cnt_threshold in ras structure
+        *
+        * max_length (max record length in eeprom) bounds the stored value.
+        *
+        * Bad page retirement enablement:
+        *    - If amdgpu_bad_page_threshold is -1 (or any negative value),
+        *      bad_page_cnt_threshold = typical value by formula: one bad
+        *      page per RAS_BAD_PAGE_RATE bytes of VRAM.
+        *
+        *    - When the value from user is 0 < amdgpu_bad_page_threshold <=
+        *      max_length, use it directly; larger values become max_length.
+        *
+        * Bad page retirement disablement:
+        *    - If amdgpu_bad_page_threshold = 0, bad page retirement
+        *      functionality is disabled, and bad_page_cnt_threshold will
+        *      take no effect.
+        *
+        * The sign is tested first: comparing a negative int against the
+        * unsigned max_length would convert it to a huge unsigned value.
+        */
+       if (tmp_threshold < 0) {
+               val = adev->gmc.mc_vram_size;
+               do_div(val, RAS_BAD_PAGE_RATE);
+               if (val > max_length)
+                       val = max_length;
+               con->bad_page_cnt_threshold = val;
+       } else if ((uint32_t)tmp_threshold > max_length) {
+               con->bad_page_cnt_threshold = max_length;
+       } else {
+               con->bad_page_cnt_threshold = tmp_threshold;
+       }
+}
+
  /* called in gpu recovery/init */
  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
  {
@@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
  {
         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
         struct ras_err_handler_data **data;
+       uint32_t max_eeprom_records_len = 0;
         int ret;
  
         if (con)
@@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         atomic_set(&con->in_recovery, 0);
         con->adev = adev;
  
+       max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+       amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
         ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
         if (ret)
                 goto free;