OSDN Git Service

drm/amdgpu: set poison supported flag for RAS (v2)
author Tao Zhou <tao.zhou1@amd.com>
Fri, 17 Sep 2021 10:24:09 +0000 (18:24 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 28 Sep 2021 13:30:07 +0000 (09:30 -0400)
Add RAS poison supported flag and tell PSP RAS TA about the info.

v2: rename poison mode to poison supported, since poison mode can still be
disabled even when it is supported.
    print the value of poison supported if RAS feature enablement fails.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

index 17d0977..f17a103 100644 (file)
@@ -1444,9 +1444,9 @@ static int psp_ras_initialize(struct psp_context *psp)
        ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
        memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
-       if (psp->adev->gmc.xgmi.connected_to_cpu)
+       if (amdgpu_ras_is_poison_mode_supported(adev))
                ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
-       else
+       if (!adev->gmc.xgmi.connected_to_cpu)
                ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
 
        ret = psp_ras_load(psp);
index e1c34ee..4c547ee 100644 (file)
@@ -710,10 +710,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
        if (!amdgpu_ras_intr_triggered()) {
                ret = psp_ras_enable_features(&adev->psp, info, enable);
                if (ret) {
-                       dev_err(adev->dev, "ras %s %s failed %d\n",
+                       dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
                                enable ? "enable":"disable",
                                get_ras_block_str(head),
-                               ret);
+                               amdgpu_ras_is_poison_mode_supported(adev), ret);
                        goto out;
                }
        }
@@ -2238,6 +2238,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int r;
+       bool df_poison, umc_poison;
 
        if (con)
                return 0;
@@ -2308,6 +2309,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
                        goto release_con;
        }
 
+       /* Init poison supported flag, the default value is false */
+       if (adev->df.funcs &&
+           adev->df.funcs->query_ras_poison_mode &&
+           adev->umc.ras_funcs &&
+           adev->umc.ras_funcs->query_ras_poison_mode) {
+               df_poison =
+                       adev->df.funcs->query_ras_poison_mode(adev);
+               umc_poison =
+                       adev->umc.ras_funcs->query_ras_poison_mode(adev);
+               /* Only poison is set in both DF and UMC, we can support it */
+               if (df_poison && umc_poison)
+                       con->poison_supported = true;
+               else if (df_poison != umc_poison)
+                       dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+                                       df_poison, umc_poison);
+       }
+
        if (amdgpu_ras_fs_init(adev)) {
                r = -EINVAL;
                goto release_con;
@@ -2351,6 +2369,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
        return 0;
 }
 
+bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!con)
+               return false;
+
+       return con->poison_supported;
+}
+
 /* helper function to handle common stuff in ip late init phase */
 int amdgpu_ras_late_init(struct amdgpu_device *adev,
                         struct ras_common_if *ras_block,
index 37b3c40..e36f4de 100644 (file)
@@ -351,6 +351,9 @@ struct amdgpu_ras {
        /* disable ras error count harvest in recovery */
        bool disable_ras_err_cnt_harvest;
 
+       /* is poison mode supported */
+       bool poison_supported;
+
        /* RAS count errors delayed work */
        struct delayed_work ras_counte_delay_work;
        atomic_t ras_ue_count;
@@ -646,4 +649,6 @@ int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
 
 const char *get_ras_block_str(struct ras_common_if *ras_block);
 
+bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
+
 #endif