OSDN Git Service

drm/amdgpu: query umc error info from ecc_table v2
author: Stanley.Yang <Stanley.Yang@amd.com>
Thu, 18 Nov 2021 08:30:43 +0000 (16:30 +0800)
committer: Alex Deucher <alexander.deucher@amd.com>
Mon, 22 Nov 2021 19:45:46 +0000 (14:45 -0500)
If the SMU supports ECCTABLE, the driver can message the SMU to get the
ecc_table and then query UMC error info from ECCTABLE.

v2:
    optimize the source code to make the logic more reasonable

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

index 53b957a..46910e7 100644 (file)
@@ -892,6 +892,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
        }
 }
 
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       int ret = 0;
+
+       /*
+        * Choose the query method by smu_get_ecc_info() result: -EOPNOTSUPP uses
+        * query_ras_error_*; 0 uses ecc_info_query_ras_error_*; others skip both.
+        */
+       ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+       if (ret == -EOPNOTSUPP) {
+               if (adev->umc.ras_funcs &&
+                       adev->umc.ras_funcs->query_ras_error_count)
+                       adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+               /* umc query_ras_error_address is also responsible for clearing
+                * error status
+                */
+               if (adev->umc.ras_funcs &&
+                   adev->umc.ras_funcs->query_ras_error_address)
+                       adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+       } else if (!ret) {
+               if (adev->umc.ras_funcs &&
+                       adev->umc.ras_funcs->ecc_info_query_ras_error_count)
+                       adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+
+               if (adev->umc.ras_funcs &&
+                       adev->umc.ras_funcs->ecc_info_query_ras_error_address)
+                       adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+       }
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
                                  struct ras_query_if *info)
@@ -905,15 +937,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 
        switch (info->head.block) {
        case AMDGPU_RAS_BLOCK__UMC:
-               if (adev->umc.ras_funcs &&
-                   adev->umc.ras_funcs->query_ras_error_count)
-                       adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
-               /* umc query_ras_error_address is also responsible for clearing
-                * error status
-                */
-               if (adev->umc.ras_funcs &&
-                   adev->umc.ras_funcs->query_ras_error_address)
-                       adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+               amdgpu_ras_get_ecc_info(adev, &err_data);
                break;
        case AMDGPU_RAS_BLOCK__SDMA:
                if (adev->sdma.funcs->query_ras_error_count) {
index a90029e..6e4bea0 100644 (file)
@@ -94,30 +94,58 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
 {
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int ret = 0;
 
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-       if (adev->umc.ras_funcs &&
-           adev->umc.ras_funcs->query_ras_error_count)
-           adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
-
-       if (adev->umc.ras_funcs &&
-           adev->umc.ras_funcs->query_ras_error_address &&
-           adev->umc.max_ras_err_cnt_per_query) {
-               err_data->err_addr =
-                       kcalloc(adev->umc.max_ras_err_cnt_per_query,
-                               sizeof(struct eeprom_table_record), GFP_KERNEL);
-
-               /* still call query_ras_error_address to clear error status
-                * even NOMEM error is encountered
-                */
-               if(!err_data->err_addr)
-                       dev_warn(adev->dev, "Failed to alloc memory for "
-                                       "umc error address record!\n");
-
-               /* umc query_ras_error_address is also responsible for clearing
-                * error status
-                */
-               adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+       ret = smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc));
+       if (ret == -EOPNOTSUPP) {
+               if (adev->umc.ras_funcs &&
+                   adev->umc.ras_funcs->query_ras_error_count)
+                   adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);
+
+               if (adev->umc.ras_funcs &&
+                   adev->umc.ras_funcs->query_ras_error_address &&
+                   adev->umc.max_ras_err_cnt_per_query) {
+                       err_data->err_addr =
+                               kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                                       sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+                       /* still call query_ras_error_address to clear error status
+                        * even NOMEM error is encountered
+                        */
+                       if(!err_data->err_addr)
+                               dev_warn(adev->dev, "Failed to alloc memory for "
+                                               "umc error address record!\n");
+
+                       /* umc query_ras_error_address is also responsible for clearing
+                        * error status
+                        */
+                       adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
+               }
+       } else if (!ret) {
+               if (adev->umc.ras_funcs &&
+                   adev->umc.ras_funcs->ecc_info_query_ras_error_count)
+                   adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, ras_error_status);
+
+               if (adev->umc.ras_funcs &&
+                   adev->umc.ras_funcs->ecc_info_query_ras_error_address &&
+                   adev->umc.max_ras_err_cnt_per_query) {
+                       err_data->err_addr =
+                               kcalloc(adev->umc.max_ras_err_cnt_per_query,
+                                       sizeof(struct eeprom_table_record), GFP_KERNEL);
+
+                       /* still call query_ras_error_address to clear error status
+                        * even NOMEM error is encountered
+                        */
+                       if(!err_data->err_addr)
+                               dev_warn(adev->dev, "Failed to alloc memory for "
+                                               "umc error address record!\n");
+
+                       /* umc query_ras_error_address is also responsible for clearing
+                        * error status
+                        */
+                       adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, ras_error_status);
+               }
        }
 
        /* only uncorrectable error needs gpu reset */