From 640ae42efb828be69a9ee6ac88fb3d5a3e678ddf Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 22 Sep 2021 14:04:52 +0800 Subject: [PATCH] drm/amdgpu: Updated RAS infrastructure Update RAS infrastructure to support RAS query for MCA subblocks Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 19 +++-- drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 9 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 4 +- drivers/gpu/drm/amd/amdgpu/ta_ras_if.h | 11 ++- 7 files changed, 146 insertions(+), 52 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index a2d3dbbf7d25..ce538f4819f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -31,7 +31,7 @@ void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) { - uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); + uint64_t mc_status = RREG64_PCIE(mc_status_addr); if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) @@ -42,7 +42,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr, unsigned long *error_count) { - uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4); + uint64_t mc_status = RREG64_PCIE(mc_status_addr); if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || @@ -56,7 +56,7 @@ void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev, void amdgpu_mca_reset_error_count(struct amdgpu_device *adev, uint64_t mc_status_addr) { - WREG64_PCIE(mc_status_addr * 4, 0x0ULL); + WREG64_PCIE(mc_status_addr, 0x0ULL); } void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev, @@ -87,8 +87,8 @@ int amdgpu_mca_ras_late_init(struct amdgpu_device *adev, if (!mca_dev->ras_if) return -ENOMEM; mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block; + mca_dev->ras_if->sub_block_index = mca_dev->ras_funcs->ras_sub_block; mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; - mca_dev->ras_if->sub_block_index = 0; } ih_info.head = fs_info.head = *mca_dev->ras_if; r = amdgpu_ras_late_init(adev, mca_dev->ras_if, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index f860f2f0e296..c74bc7177066 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -29,6 +29,7 @@ struct amdgpu_mca_ras_funcs { void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status); uint32_t ras_block; + uint32_t ras_sub_block; const char* sysfs_name; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b5332db4d287..912ea1f9fd04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -61,9 +61,30 @@ const char *ras_block_string[] = { "mp0", "mp1", "fuse", - "mpio", + "mca", }; +const char *ras_mca_block_string[] = { + "mca_mp0", + "mca_mp1", + "mca_mpio", + "mca_iohc", +}; + +const char *get_ras_block_str(struct ras_common_if *ras_block) +{ + if (!ras_block) + return "NULL"; + + if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) + return "OUT OF RANGE"; + + if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) + return ras_mca_block_string[ras_block->sub_block_index]; + + return ras_block_string[ras_block->block]; +} + #define ras_err_str(i) (ras_error_string[ffs(i)]) #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) @@ -188,7 +209,7 @@ static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { *block_id = i; - if (strcmp(name, ras_block_str(i)) == 0) + if (strcmp(name, ras_block_string[i]) == 0) return 0; } return -EINVAL; @@ -510,7 +531,6 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; - if (obj->adev->asic_type == CHIP_ALDEBARAN) { if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) DRM_WARN("Failed to reset error counter and error status"); @@ -530,7 +550,7 @@ static inline void put_obj(struct ras_manager *obj) if (obj && (--obj->use == 0)) list_del(&obj->node); if (obj && (obj->use < 0)) - DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block)); + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); } /* make one obj and return it. */ @@ -546,7 +566,14 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, if (head->block >= AMDGPU_RAS_BLOCK_COUNT) return NULL; - obj = &con->objs[head->block]; + if (head->block == AMDGPU_RAS_BLOCK__MCA) { + if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) + return NULL; + + obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; + } else + obj = &con->objs[head->block]; + /* already exist. return obj? */ if (alive_obj(obj)) return NULL; @@ -574,19 +601,21 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, if (head->block >= AMDGPU_RAS_BLOCK_COUNT) return NULL; - obj = &con->objs[head->block]; + if (head->block == AMDGPU_RAS_BLOCK__MCA) { + if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) + return NULL; + + obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; + } else + obj = &con->objs[head->block]; - if (alive_obj(obj)) { - WARN_ON(head->block != obj->head.block); + if (alive_obj(obj)) return obj; - } } else { - for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { obj = &con->objs[i]; - if (alive_obj(obj)) { - WARN_ON(i != obj->head.block); + if (alive_obj(obj)) return obj; - } } } @@ -627,8 +656,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, */ if (!amdgpu_ras_is_feature_allowed(adev, head)) return 0; - if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) - return 0; if (enable) { if (!obj) { @@ -679,18 +706,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, /* Do not enable if it is not allowed. */ WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); - /* Are we alerady in that state we are going to set? */ - if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) { - ret = 0; - goto out; - } if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { dev_err(adev->dev, "ras %s %s failed %d\n", enable ? "enable":"disable", - ras_block_str(head->block), + get_ras_block_str(head), ret); goto out; } @@ -732,7 +754,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, if (!ret) dev_info(adev->dev, "RAS INFO: %s setup object\n", - ras_block_str(head->block)); + get_ras_block_str(head)); } } else { /* setup the object then issue a ras TA disable cmd.*/ @@ -782,17 +804,39 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, bool bypass) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; int i; - const enum amdgpu_ras_error_type default_ras_type = - AMDGPU_RAS_ERROR__NONE; + const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE; - for (i = 0; i < ras_block_count; i++) { + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { struct ras_common_if head = { .block = i, .type = default_ras_type, .sub_block_index = 0, }; + + if (i == AMDGPU_RAS_BLOCK__MCA) + continue; + + if (bypass) { + /* + * bypass psp. vbios enable ras for us. + * so just create the obj + */ + if (__amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } else { + if (amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } + } + + for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) { + struct ras_common_if head = { + .block = AMDGPU_RAS_BLOCK__MCA, + .type = default_ras_type, + .sub_block_index = i, + }; + if (bypass) { /* * bypass psp. vbios enable ras for us. @@ -810,6 +854,32 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, } /* feature ctl end */ + +void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_err_data *err_data) +{ + switch (ras_block->sub_block_index) { + case AMDGPU_RAS_MCA_BLOCK__MP0: + if (adev->mca.mp0.ras_funcs && + adev->mca.mp0.ras_funcs->query_ras_error_count) + adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data); + break; + case AMDGPU_RAS_MCA_BLOCK__MP1: + if (adev->mca.mp1.ras_funcs && + adev->mca.mp1.ras_funcs->query_ras_error_count) + adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data); + break; + case AMDGPU_RAS_MCA_BLOCK__MPIO: + if (adev->mca.mpio.ras_funcs && + adev->mca.mpio.ras_funcs->query_ras_error_count) + adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data); + break; + default: + break; + } +} + /* query/inject/cure begin */ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) @@ -873,6 +943,9 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->hdp.ras_funcs->query_ras_error_count) adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__MCA: + amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data); + break; default: break; } @@ -894,13 +967,13 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_die_id(adev), obj->err_data.ce_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } else { dev_info(adev->dev, "%ld correctable hardware errors " "detected in %s block, no user " "action is needed.\n", obj->err_data.ce_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } } if (err_data.ue_count) { @@ -913,12 +986,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->smuio.funcs->get_socket_id(adev), adev->smuio.funcs->get_die_id(adev), obj->err_data.ue_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } else { dev_info(adev->dev, "%ld uncorrectable hardware errors " "detected in %s block\n", obj->err_data.ue_count, - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); } } @@ -1028,9 +1101,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__PCIE_BIF: - case AMDGPU_RAS_BLOCK__MP0: - case AMDGPU_RAS_BLOCK__MP1: - case AMDGPU_RAS_BLOCK__MPIO: + case AMDGPU_RAS_BLOCK__MCA: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: @@ -1038,13 +1109,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, break; default: dev_info(adev->dev, "%s error injection is not supported yet\n", - ras_block_str(info->head.block)); + get_ras_block_str(&info->head)); ret = -EINVAL; } if (ret) dev_err(adev->dev, "ras inject %s failed %d\n", - ras_block_str(info->head.block), ret); + get_ras_block_str(&info->head), ret); return ret; } @@ -1387,7 +1458,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) if (amdgpu_ras_is_supported(adev, obj->head.block) && (obj->attr_inuse == 1)) { sprintf(fs_info.debugfs_name, "%s_err_inject", - ras_block_str(obj->head.block)); + get_ras_block_str(&obj->head)); fs_info.head = obj->head; amdgpu_ras_debugfs_create(adev, &fs_info, dir); } @@ -2185,7 +2256,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) return 0; con = kmalloc(sizeof(struct amdgpu_ras) + - sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT, + sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT + + sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, GFP_KERNEL|__GFP_ZERO); if (!con) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 1670467c2054..ec42e9873aaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -49,15 +49,22 @@ enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__MP0, AMDGPU_RAS_BLOCK__MP1, AMDGPU_RAS_BLOCK__FUSE, - AMDGPU_RAS_BLOCK__MPIO, + AMDGPU_RAS_BLOCK__MCA, AMDGPU_RAS_BLOCK__LAST }; -extern const char *ras_block_string[]; +enum amdgpu_ras_mca_block { + AMDGPU_RAS_MCA_BLOCK__MP0 = 0, + AMDGPU_RAS_MCA_BLOCK__MP1, + AMDGPU_RAS_MCA_BLOCK__MPIO, + AMDGPU_RAS_MCA_BLOCK__IOHC, + + AMDGPU_RAS_MCA_BLOCK__LAST +}; -#define ras_block_str(i) (ras_block_string[i]) #define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST +#define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST #define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) enum amdgpu_ras_gfx_subblock { @@ -544,8 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { return TA_RAS_BLOCK__MP1; case AMDGPU_RAS_BLOCK__FUSE: return TA_RAS_BLOCK__FUSE; - case AMDGPU_RAS_BLOCK__MPIO: - return TA_RAS_BLOCK__MPIO; + case AMDGPU_RAS_BLOCK__MCA: + return TA_RAS_BLOCK__MCA; default: WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block); return TA_RAS_BLOCK__UMC; @@ -640,4 +647,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev); int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); +const char *get_ras_block_str(struct ras_common_if *ras_block); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c index 058b65730a84..8f7107d392af 100644 --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c @@ -52,7 +52,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = { .ras_fini = mca_v3_0_mp0_ras_fini, .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MP0, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP0, .sysfs_name = "mp0_err_count", }; @@ -79,7 +80,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = { .ras_fini = mca_v3_0_mp1_ras_fini, .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MP1, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MP1, .sysfs_name = "mp1_err_count", }; @@ -106,7 +108,8 @@ const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = { .ras_fini = mca_v3_0_mpio_ras_fini, .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count, .query_ras_error_address = NULL, - .ras_block = AMDGPU_RAS_BLOCK__MPIO, + .ras_block = AMDGPU_RAS_BLOCK__MCA, + .ras_sub_block = AMDGPU_RAS_MCA_BLOCK__MPIO, .sysfs_name = "mpio_err_count", }; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 0a9fc19b1be0..91b3afa946f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -387,13 +387,13 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device "errors detected in %s block, " "no user action is needed.\n", obj->err_data.ce_count, - ras_block_str(adev->nbio.ras_if->block)); + get_ras_block_str(adev->nbio.ras_if)); if (err_data.ue_count) dev_info(adev->dev, "%ld uncorrectable hardware " "errors detected in %s block\n", obj->err_data.ue_count, - ras_block_str(adev->nbio.ras_if->block)); + get_ras_block_str(adev->nbio.ras_if)); } dev_info(adev->dev, "RAS controller interrupt triggered " diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index 532260fd64db..82d956d15b54 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -73,10 +73,19 @@ enum ta_ras_block { TA_RAS_BLOCK__MP0, TA_RAS_BLOCK__MP1, TA_RAS_BLOCK__FUSE, - TA_RAS_BLOCK__MPIO, + TA_RAS_BLOCK__MCA, TA_NUM_BLOCK_MAX }; +enum ta_ras_mca_block +{ + TA_RAS_MCA_BLOCK__MP0 = 0, + TA_RAS_MCA_BLOCK__MP1 = 1, + TA_RAS_MCA_BLOCK__MPIO = 2, + TA_RAS_MCA_BLOCK__IOHC = 3, + TA_MCA_NUM_BLOCK_MAX +}; + enum ta_ras_error_type { TA_RAS_ERROR__NONE = 0, TA_RAS_ERROR__PARITY = 1, -- 2.11.0