OSDN Git Service

drm/amdgpu: skip umc ras irq handling in poison mode (v2)
author Tao Zhou <tao.zhou1@amd.com>
Fri, 17 Sep 2021 10:40:57 +0000 (18:40 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 28 Sep 2021 13:30:07 +0000 (09:30 -0400)
In ras poison mode, a umc uncorrectable error will be ignored until
the corrupted data is consumed by another ras module (such as gfx or sdma).

v2: update the debug message and replace dev_warn with dev_info.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 4c547ee..8243f79 100644 (file)
@@ -1544,22 +1544,28 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;
 
-               /* Let IP handle its data, maybe we need get the output
-                * from the callback to udpate the error type/count, etc
-                */
                if (data->cb) {
-                       ret = data->cb(obj->adev, &err_data, &entry);
-                       /* ue will trigger an interrupt, and in that case
-                        * we need do a reset to recovery the whole system.
-                        * But leave IP do that recovery, here we just dispatch
-                        * the error.
-                        */
-                       if (ret == AMDGPU_RAS_SUCCESS) {
-                               /* these counts could be left as 0 if
-                                * some blocks do not count error number
+                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
+                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               dev_info(obj->adev->dev,
+                                               "Poison is created, no user action is needed.\n");
+                       else {
+                               /* Let IP handle its data, maybe we need get the output
+                                * from the callback to udpate the error type/count, etc
+                                */
+                               ret = data->cb(obj->adev, &err_data, &entry);
+                               /* ue will trigger an interrupt, and in that case
+                                * we need do a reset to recovery the whole system.
+                                * But leave IP do that recovery, here we just dispatch
+                                * the error.
                                 */
-                               obj->err_data.ue_count += err_data.ue_count;
-                               obj->err_data.ce_count += err_data.ce_count;
+                               if (ret == AMDGPU_RAS_SUCCESS) {
+                                       /* these counts could be left as 0 if
+                                        * some blocks do not count error number
+                                        */
+                                       obj->err_data.ue_count += err_data.ue_count;
+                                       obj->err_data.ce_count += err_data.ce_count;
+                               }
                        }
                }
        }