From f5c7e7797060255dbc8160734ccc5ad6183c5e04 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Wed, 7 Sep 2022 16:07:42 +0800 Subject: [PATCH] drm/amdgpu: Adjust removal control flow for smu v13_0_2 Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. V3: 1. Update commit comments. 2. Split a patch into multiple patches. 3. The current patch does: a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into the existing gpu recover path, which makes all devices in the hive list only have HW reset but no resume (except the base IP). b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover in amdgpu_pci_remove when removing the first device in the hive list. c. When removing the first device, the IP blocks keyword function call sequence is as follows: .suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini. ^ | |-<----------<---------<----| The first three sequences are because of a call to amdgpu_device_gpu_recover. The three sequences will be executed in a loop until all devices in the hive list are iterated. The sequences starting from .hw_fini only apply to the first device. Since .suspend has been called before, except the resumed phase1 basic ip blocks, all other ip blocks .hw_fini of current device will do nothing. d. When removing other devices, the calling sequence is the same as legacy: .hw_fini -> .early_fini -> .sw_fini. Since .suspend has been called when removing the first device, except the resumed phase1 basic ip blocks, all other ip blocks .hw_fini of current device will do nothing. 
Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 30 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 30 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + 4 files changed, 62 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 47aac179951a..c268bd033064 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4749,6 +4749,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -4768,6 +4769,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4812,6 +4817,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } + /* Since the mode1 reset affects base ip blocks, the + * phase1 ip blocks need to be resumed. Otherwise there + * will be a BIOS signature error and the psp bootloader + * can't load kdb on the next amdgpu install. 
+ */ + if (gpu_reset_for_dev_remove) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) + amdgpu_device_ip_resume_phase1(tmp_adev); + + goto end; + } + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5134,6 +5151,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; + bool gpu_reset_for_dev_remove = false; + + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5253,6 +5275,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + if (gpu_reset_for_dev_remove) { + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(tmp_adev); + } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5286,6 +5312,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ adev->asic_reset_res = 0; goto retry; } + + if (!r && gpu_reset_for_dev_remove) + goto recover_end; } skip_hw_reset: @@ -5359,6 +5388,7 @@ skip_sched_resume: amdgpu_device_unset_mp1_state(tmp_adev); } +recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 728a0933ea6f..2e16210bebaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(dev->dev); } + if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) { + bool need_to_reset_gpu = false; + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + struct amdgpu_hive_info *hive; + + hive = amdgpu_get_xgmi_hive(adev); + if (hive->device_remove_count == 0) + need_to_reset_gpu = true; + hive->device_remove_count++; + amdgpu_put_xgmi_hive(hive); + } else { + need_to_reset_gpu = true; + } + + /* Workaround for ASICs need to reset SMU. + * Called only when the first device is removed. 
+ */ + if (need_to_reset_gpu) { + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags); + amdgpu_device_gpu_recover(adev, NULL, &reset_context); + } + } + amdgpu_driver_unload_kms(dev); drm_dev_unplug(dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index f71b83c42590..dc43fcb93eac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_MODE2_RESET = 2, + AMDGPU_RESET_FOR_DEVICE_REMOVE = 3, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 552e6fb55aa8..30dcc1681b4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -43,6 +43,7 @@ struct amdgpu_hive_info { } pstate; struct amdgpu_reset_domain *reset_domain; + uint32_t device_remove_count; }; struct amdgpu_pcs_ras_field { -- 2.11.0