From c84d46707ebbfc303268e27d5aeb8bb4e6d5b1ff Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Wed, 22 Jul 2020 10:00:27 +0800 Subject: [PATCH] drm/amdgpu: validate bad page threshold in ras(v3) Bad page threshold value should be valid in the range between -1 and max records length of eeprom. It could determine when saved bad pages exceed threshold value, and proceed corresponding actions. v2: When using the default typical value, it should be min value between typical value and eeprom max records length. v3: drop the case of setting bad_page_cnt_threshold to be 0xFFFFFFFF, as it confuses user. Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 48 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 5 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++ 4 files changed, 58 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5680f7eafcb1..6660094a1063 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -69,6 +69,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ +#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1699,6 +1702,47 @@ out: return ret; } +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, + uint32_t max_length) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int tmp_threshold = amdgpu_bad_page_threshold; + u64 val; + + /* + * Justification of value bad_page_cnt_threshold in ras structure + * + * Generally, -1 <= amdgpu_bad_page_threshold <= max record length + * in eeprom, and introduce two scenarios accordingly. + * + * Bad page retirement enablement: + * - If amdgpu_bad_page_threshold = -1, + * bad_page_cnt_threshold = typical value by formula. + * + * - When the value from user is 0 < amdgpu_bad_page_threshold < + * max record length in eeprom, use it directly. + * + * Bad page retirement disablement: + * - If amdgpu_bad_page_threshold = 0, bad page retirement + * functionality is disabled, and bad_page_cnt_threshold will + * take no effect. + */ + + if (tmp_threshold < -1) + tmp_threshold = -1; + else if (tmp_threshold > max_length) + tmp_threshold = max_length; + + if (tmp_threshold == -1) { + val = adev->gmc.mc_vram_size; + do_div(val, RAS_BAD_PAGE_RATE); + con->bad_page_cnt_threshold = min(lower_32_bits(val), + max_length); + } else { + con->bad_page_cnt_threshold = tmp_threshold; + } +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; + uint32_t max_eeprom_records_len = 0; int ret; if (con) @@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); if (ret) goto free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b2667342cf67..4672649a9293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -336,6 +336,9 @@ struct amdgpu_ras { struct amdgpu_ras_eeprom_control eeprom_control; bool error_query_ready; + + /* bad page count threshold */ + uint32_t bad_page_cnt_threshold; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index c0096097bbcf..a2c982b1eac6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -499,6 +499,11 @@ free_buff: return ret == num ? 0 : -EIO; } +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +{ + return EEPROM_MAX_RECORD_NUM; +} + /* Used for testing if bugs encountered */ #if 0 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 9e7d640920fb..e285e8cafd48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -84,6 +84,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, bool write, int num); +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); + void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control); #endif // _AMDGPU_RAS_EEPROM_H -- 2.11.0