OSDN Git Service

drm/amdgpu: bypass querying ras error count registers
author Guchun Chen <guchun.chen@amd.com>
Tue, 4 Aug 2020 07:00:53 +0000 (15:00 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 14 Aug 2020 20:12:22 +0000 (16:12 -0400)
Add a guard that, once RAS recovery is issued by a RAS sync flood
interrupt or a RAS controller interrupt, decides whether to bypass or
execute the RAS error count register harvest of all IPs.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

index 1a55f6f..fbe464c 100644 (file)
@@ -1547,17 +1547,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
        struct list_head device_list, *device_list_handle =  NULL;
        struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
 
-       /* Build list of devices to query RAS related errors */
-       if  (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-               device_list_handle = &hive->device_list;
-       else {
-               INIT_LIST_HEAD(&device_list);
-               list_add_tail(&adev->gmc.xgmi.head, &device_list);
-               device_list_handle = &device_list;
-       }
+       if (!ras->disable_ras_err_cnt_harvest) {
+               /* Build list of devices to query RAS related errors */
+               if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
+                       device_list_handle = &hive->device_list;
+               } else {
+                       INIT_LIST_HEAD(&device_list);
+                       list_add_tail(&adev->gmc.xgmi.head, &device_list);
+                       device_list_handle = &device_list;
+               }
 
-       list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
-               amdgpu_ras_log_on_err_counter(remote_adev);
+               list_for_each_entry(remote_adev,
+                               device_list_handle, gmc.xgmi.head)
+                       amdgpu_ras_log_on_err_counter(remote_adev);
        }
 
        if (amdgpu_device_should_recover_gpu(ras->adev))
index 70a6fca..6b8d7bb 100644 (file)
@@ -343,6 +343,9 @@ struct amdgpu_ras {
 
        /* bad page count threshold */
        uint32_t bad_page_cnt_threshold;
+
+       /* disable ras error count harvest in recovery */
+       bool disable_ras_err_cnt_harvest;
 };
 
 struct ras_fs_data {
index e629156..eadc952 100644 (file)
@@ -302,6 +302,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
        uint32_t bif_doorbell_intr_cntl;
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
        struct ras_err_data err_data = {0, 0, 0, NULL};
+       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
        bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
        if (REG_GET_FIELD(bif_doorbell_intr_cntl,
@@ -312,28 +313,31 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
                                                RAS_CNTLR_INTERRUPT_CLEAR, 1);
                WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
 
-               /*
-                * clear error status after ras_controller_intr according to
-                * hw team and count ue number for query
-                */
-               nbio_v7_4_query_ras_error_count(adev, &err_data);
-
-               /* logging on error counter and printing for awareness */
-               obj->err_data.ue_count += err_data.ue_count;
-               obj->err_data.ce_count += err_data.ce_count;
-
-               if (err_data.ce_count)
-                       dev_info(adev->dev, "%ld correctable hardware "
-                                       "errors detected in %s block, "
-                                       "no user action is needed.\n",
-                                       obj->err_data.ce_count,
-                                       adev->nbio.ras_if->name);
-
-               if (err_data.ue_count)
-                       dev_info(adev->dev, "%ld uncorrectable hardware "
-                                       "errors detected in %s block\n",
-                                       obj->err_data.ue_count,
-                                       adev->nbio.ras_if->name);
+               if (!ras->disable_ras_err_cnt_harvest) {
+                       /*
+                        * clear error status after ras_controller_intr
+                        * according to hw team and count ue number
+                        * for query
+                        */
+                       nbio_v7_4_query_ras_error_count(adev, &err_data);
+
+                       /* logging on error cnt and printing for awareness */
+                       obj->err_data.ue_count += err_data.ue_count;
+                       obj->err_data.ce_count += err_data.ce_count;
+
+                       if (err_data.ce_count)
+                               dev_info(adev->dev, "%ld correctable hardware "
+                                               "errors detected in %s block, "
+                                               "no user action is needed.\n",
+                                               obj->err_data.ce_count,
+                                               adev->nbio.ras_if->name);
+
+                       if (err_data.ue_count)
+                               dev_info(adev->dev, "%ld uncorrectable hardware "
+                                               "errors detected in %s block\n",
+                                               obj->err_data.ue_count,
+                                               adev->nbio.ras_if->name);
+               }
 
                dev_info(adev->dev, "RAS controller interrupt triggered "
                                        "by NBIF error\n");