OSDN Git Service

drm/amdgpu: validate bad page threshold in ras(v3)
authorGuchun Chen <guchun.chen@amd.com>
Wed, 22 Jul 2020 02:00:27 +0000 (10:00 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Tue, 4 Aug 2020 21:25:58 +0000 (17:25 -0400)
The bad page threshold value must lie in the range between -1 and
the max records length of the eeprom. It is used to determine when
the number of saved bad pages exceeds the threshold, so that the
corresponding actions can be taken.

v2: When the default typical value is used, the threshold should be
the minimum of the typical value and the eeprom max records length.

v3: drop the case of setting bad_page_cnt_threshold to be
    0xFFFFFFFF, as it confuses user.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

index 5680f7e..6660094 100644 (file)
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
 
+/* Typical ECC bad page rate: 1 bad page per 100 MiB of VRAM, in bytes */
+#define RAS_BAD_PAGE_RATE              (100 * 1024 * 1024ULL)
+
 enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1699,6 +1702,47 @@ out:
        return ret;
 }
 
+/**
+ * amdgpu_ras_validate_threshold - derive the effective bad page threshold
+ * @adev: device whose RAS context receives the validated threshold
+ * @max_length: upper bound for the threshold (max record length in eeprom)
+ *
+ * Translates the amdgpu_bad_page_threshold setting into
+ * bad_page_cnt_threshold of the RAS context.
+ *
+ * Generally, -1 <= amdgpu_bad_page_threshold <= max record length in
+ * eeprom; anything outside that range is pulled back into it.
+ *
+ * Bad page retirement enablement:
+ *    - If amdgpu_bad_page_threshold is -1 (or any smaller value),
+ *      bad_page_cnt_threshold = typical value by formula, i.e. VRAM size
+ *      divided by RAS_BAD_PAGE_RATE, capped at @max_length.
+ *
+ *    - If 0 < amdgpu_bad_page_threshold <= @max_length, it is used
+ *      directly. Larger values are clamped to @max_length.
+ *
+ * Bad page retirement disablement:
+ *    - If amdgpu_bad_page_threshold = 0, bad page retirement
+ *      functionality is disabled, and bad_page_cnt_threshold will
+ *      take no effect.
+ */
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+                                       uint32_t max_length)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int tmp_threshold = amdgpu_bad_page_threshold;
+       u64 val;
+
+       /*
+        * Handle negative values before any comparison against the
+        * unsigned max_length. Comparing the signed value directly would
+        * convert -1 to 0xFFFFFFFF and wrongly clamp it to max_length
+        * instead of selecting the typical value.
+        */
+       if (tmp_threshold < 0) {
+               val = adev->gmc.mc_vram_size;
+               do_div(val, RAS_BAD_PAGE_RATE);
+               con->bad_page_cnt_threshold = min(lower_32_bits(val),
+                                               max_length);
+               return;
+       }
+
+       /* tmp_threshold is non-negative here, so the cast is lossless. */
+       if ((uint32_t)tmp_threshold > max_length)
+               con->bad_page_cnt_threshold = max_length;
+       else
+               con->bad_page_cnt_threshold = tmp_threshold;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data;
+       uint32_t max_eeprom_records_len = 0;
        int ret;
 
        if (con)
@@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        atomic_set(&con->in_recovery, 0);
        con->adev = adev;
 
+       max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+       amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
        ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
        if (ret)
                goto free;
index b266734..4672649 100644 (file)
@@ -336,6 +336,9 @@ struct amdgpu_ras {
        struct amdgpu_ras_eeprom_control eeprom_control;
 
        bool error_query_ready;
+
+       /* bad page count threshold */
+       uint32_t bad_page_cnt_threshold;
 };
 
 struct ras_fs_data {
index c009609..a2c982b 100644 (file)
@@ -499,6 +499,11 @@ free_buff:
        return ret == num ? 0 : -EIO;
 }
 
+/*
+ * Return the maximum number of records, EEPROM_MAX_RECORD_NUM.
+ *
+ * Deliberately not marked inline: the function is called from another
+ * translation unit, so it needs a regular external definition.
+ */
+uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+{
+       return EEPROM_MAX_RECORD_NUM;
+}
+
 /* Used for testing if bugs encountered */
 #if 0
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
index 9e7d640..e285e8c 100644 (file)
@@ -84,6 +84,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                                            bool write,
                                            int num);
 
+/* Plain external prototype; no body is provided in this header. */
+uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
+
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
 
 #endif // _AMDGPU_RAS_EEPROM_H