From c7490949239646c61db869014fcc74ed2cb91d53 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 23 Sep 2021 14:11:22 +0800 Subject: [PATCH] amd/amdkfd: add ras page retirement handling for sq/sdma (v3) In ras poison mode, page retirement will be handled by the irq handler of the module which consumes corrupted data. v2: rename ras_process_cb to ras_poison_consumption_handler. move the handler's implementation from ASIC specific file to common file. v3: call gpu reset for xGMI connected mode. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 ++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 4 ++-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1d41c2c00623..7505f1b9d3f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -31,6 +31,8 @@ #include #include "amdgpu_xgmi.h" #include +#include "amdgpu_ras.h" +#include "amdgpu_umc.h" /* Total memory size in system memory and all GPU VRAM. Used to * estimate worst case amount of memory to reserve for page tables @@ -780,3 +782,15 @@ bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) return adev->have_atomics_support; } + +void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct ras_err_data err_data = {0, 0, 0, NULL}; + + /* CPU MCA will handle page retirement if connected_to_cpu is 1 */ + if (!adev->gmc.xgmi.connected_to_cpu) + amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL); + else + amdgpu_amdkfd_gpu_reset(kgd); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 3bc52b2c604f..7db37e2016df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -290,6 +290,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, uint64_t *mmap_offset); int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, struct tile_config *config); +void amdgpu_amdkfd_ras_poison_consumption_handler(struct kgd_dev *kgd); #if IS_ENABLED(CONFIG_HSA_AMD) void amdgpu_amdkfd_gpuvm_init_mem_limits(void); void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 12d91e53556c..543e7ea75593 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -231,7 +231,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_gpu_reset(dev->kgd); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); return; } break; @@ -253,7 +253,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); } else if (source_id == SOC15_INTSRC_SDMA_ECC) { kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_gpu_reset(dev->kgd); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || -- 2.11.0