From df9c8d1aa278c435c30a69b8f2418b4a52fcb929 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 8 Jul 2020 15:07:13 +0800 Subject: [PATCH] drm/amdgpu: fix system hang issue during GPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover, the atomic adev->in_gpu_reset and hive->in_reset are used to avoid re-entering GPU recovery. During GPU reset and resume, it is unsafe that other threads access GPU, which maybe cause GPU reset failed. Therefore the new rw_semaphore adev->reset_sem is introduced, which protect GPU from being accessed by external threads during recovery. v2: 1. add rwlock for some ioctls, debugfs and file-close function. 2. change to use dqm->is_resetting and dqm_lock for protection in kfd driver. 3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid re-enter GPU recovery for the same GPU hang. v3: 1. change back to use adev->reset_sem to protect kfd callback functions, because dqm_lock couldn't protect all codes, for example: free_mqd must be called outside of dqm_lock; [ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019 [ 1230.177221] Call Trace: [ 1230.178249] dump_stack+0x98/0xd5 [ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu] [ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu] [ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu] [ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu] [ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm] [ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm] [ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm] [ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm] [ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm] [ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm] [ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu] [ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu] [ 1230.193833] free_mqd+0x25/0x40 [amdgpu] [ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu] [ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu] [ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu] [ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu] [ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu] [ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20 [ 1230.202831] ksys_ioctl+0x98/0xb0 [ 1230.204004] __x64_sys_ioctl+0x1a/0x20 [ 1230.205174] do_syscall_64+0x5f/0x250 [ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe 2. remove try_lock and introduce atomic hive->in_reset, to avoid re-enter GPU recovery. v4: 1. remove an unnecessary whitespace change in kfd_chardev.c 2. remove comment codes in amdgpu_device.c 3. add more detailed comment in commit message 4. define a wrap function amdgpu_in_reset v5: 1. Fix some style issues. Reviewed-by: Hawking Zhang Suggested-by: Andrey Grodzovsky Suggested-by: Christian König Suggested-by: Felix Kuehling Suggested-by: Lijo Lazar Suggested-by: Luben Tukov Signed-off-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 40 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 53 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 + drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 350 +++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 11 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 3 +- drivers/gpu/drm/amd/amdgpu/atom.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 10 +- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 6 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 +- drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 13 +- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 13 +- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 16 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 + drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 +- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 2 +- drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 2 +- 39 files changed, 463 insertions(+), 183 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 327a0daf4a1d..58e39429395f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -961,9 +961,9 @@ struct amdgpu_device { bool in_suspend; bool in_hibernate; - bool in_gpu_reset; + atomic_t in_gpu_reset; enum pp_mp1_state mp1_state; - struct mutex lock_reset; + struct rw_semaphore reset_sem; struct amdgpu_doorbell_index doorbell_index; struct mutex notifier_lock; @@ -1278,4 +1278,9 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) return adev->gmc.tmz_enabled; } +static inline bool amdgpu_in_reset(struct amdgpu_device *adev) +{ + return atomic_read(&adev->in_gpu_reset) ? true : false; +} + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..a0ea663ecdbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, if (cp_mqd_gfx9) bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9; + if (!down_read_trylock(&adev->reset_sem)) + return -EIO; + r = amdgpu_bo_create(adev, &bp, &bo); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); - return r; + goto err; } /* map the buffer */ @@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); + up_read(&adev->reset_sem); return 0; allocate_mem_kmap_bo_failed: @@ -291,19 +295,25 @@ allocate_mem_pin_bo_failed: amdgpu_bo_unreserve(bo); allocate_mem_reserve_bo_failed: amdgpu_bo_unref(&bo); - +err: + up_read(&adev->reset_sem); return r; } void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; + down_read(&adev->reset_sem); + amdgpu_bo_reserve(bo, true); amdgpu_bo_kunmap(bo); amdgpu_bo_unpin(bo); amdgpu_bo_unreserve(bo); amdgpu_bo_unref(&(bo)); + + up_read(&adev->reset_sem); } int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, @@ -335,9 +345,14 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj) { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj; + down_read(&adev->reset_sem); + amdgpu_bo_unref(&bo); + + up_read(&adev->reset_sem); } uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd, @@ -611,12 +626,19 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, /* This works for NO_HWS. TODO: need to handle without knowing VMID */ job->vmid = vmid; + if (!down_read_trylock(&adev->reset_sem)) { + ret = -EIO; + goto err_ib_sched; + } + ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); if (ret) { DRM_ERROR("amdgpu: failed to schedule IB.\n"); goto err_ib_sched; } + up_read(&adev->reset_sem); + ret = dma_fence_wait(f, false); err_ib_sched: @@ -647,6 +669,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + if (!down_read_trylock(&adev->reset_sem)) + return -EIO; + if (adev->family == AMDGPU_FAMILY_AI) { int i; @@ -656,6 +681,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0); } + up_read(&adev->reset_sem); + return 0; } @@ -664,11 +691,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid) struct amdgpu_device *adev = (struct amdgpu_device *)kgd; const uint32_t flush_type = 0; bool all_hub = false; + int ret = -EIO; if (adev->family == AMDGPU_FAMILY_AI) all_hub = true; - return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); + if (down_read_trylock(&adev->reset_sem)) { + ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, + pasid, flush_type, all_hub); + up_read(&adev->reset_sem); + } + + return ret; } bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index bf927f432506..b0dcc800251e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -542,7 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; #if 0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 744366c7ee85..275f20399373 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -423,7 +423,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, unsigned long flags, end_jiffies; int retry; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index feab4cc6e836..4997189d8b36 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -419,7 +419,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, int retry; struct vi_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index c7fd0c47b254..73728181b88f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -539,7 +539,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; acquire_queue(kgd, pipe_id, queue_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index e5a5ba869eb4..a21cf84b882c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1190,6 +1190,9 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( return -EINVAL; } + if (!down_read_trylock(&adev->reset_sem)) + return -EIO; + *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); if (!*mem) { ret = -ENOMEM; @@ -1256,6 +1259,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( if (offset) *offset = amdgpu_bo_mmap_offset(bo); + up_read(&adev->reset_sem); return 0; allocate_init_user_pages_failed: @@ -1273,6 +1277,9 @@ err: sg_free_table(sg); kfree(sg); } + + up_read(&adev->reset_sem); + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a512ccbc4dea..a3b150304dae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1292,6 +1292,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) parser.adev = adev; parser.filp = filp; + down_read(&adev->reset_sem); + r = amdgpu_cs_parser_init(&parser, data); if (r) { DRM_ERROR("Failed to initialize parser %d!\n", r); @@ -1331,6 +1333,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) out: amdgpu_cs_parser_fini(&parser, r, reserved_buffers); + up_read(&adev->reset_sem); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 8842c55d4490..d85d13f7a043 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -358,6 +358,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(&ctx->guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; + down_read(&adev->reset_sem); + /*query ue count*/ ras_counter = amdgpu_ras_query_error_count(adev, false); /*ras counter is monotonic increasing*/ @@ -373,6 +375,8 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, ctx->ras_counter_ce = ras_counter; } + up_read(&adev->reset_sem); + mutex_unlock(&mgr->lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 193ffdb957b6..3a4b31b1c4f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) file->private_data = adev; - mutex_lock(&adev->lock_reset); + down_read(&adev->reset_sem); if (adev->autodump.dumping.done) { reinit_completion(&adev->autodump.dumping); ret = 0; } else { ret = -EBUSY; } - mutex_unlock(&adev->lock_reset); + up_read(&adev->reset_sem); return ret; } @@ -126,7 +126,7 @@ static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_ poll_wait(file, &adev->autodump.gpu_hang, poll_table); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return POLLIN | POLLRDNORM | POLLWRNORM; return 0; @@ -1241,7 +1241,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) } /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(&adev->lock_reset); + down_read(&adev->reset_sem); /* hold on the scheduler */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1268,7 +1268,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data) kthread_unpark(ring->sched.thread); } - mutex_unlock(&adev->lock_reset); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); @@ -1458,7 +1458,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val) return -ENOMEM; /* Avoid accidently unparking the sched thread during GPU reset */ - mutex_lock(&adev->lock_reset); + down_read(&adev->reset_sem); /* stop the scheduler */ kthread_park(ring->sched.thread); @@ -1499,7 +1499,7 @@ failure: /* restart the scheduler */ kthread_unpark(ring->sched.thread); - mutex_unlock(&adev->lock_reset); + up_read(&adev->reset_sem); ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aa5b54e5a1d7..62ecac97fbd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1935,7 +1935,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2106,7 +2106,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + if (!amdgpu_in_reset(adev)) return false; /* @@ -3036,7 +3036,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - mutex_init(&adev->lock_reset); + init_rwsem(&adev->reset_sem); + atomic_set(&adev->in_gpu_reset, 0); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -4064,8 +4065,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { /* post card */ - if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) - DRM_WARN("asic atom init failed!"); + if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) { + dev_warn(tmp_adev->dev, "asic atom init failed!"); + r = -EAGAIN; + goto out; + } if (!r) { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); @@ -4141,16 +4145,14 @@ end: return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev) { - if (trylock) { - if (!mutex_trylock(&adev->lock_reset)) - return false; - } else - mutex_lock(&adev->lock_reset); + if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + return false; + + down_write(&adev->reset_sem); atomic_inc(&adev->gpu_reset_counter); - adev->in_gpu_reset = true; switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; @@ -4170,8 +4172,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - adev->in_gpu_reset = false; - mutex_unlock(&adev->lock_reset); + atomic_set(&adev->in_gpu_reset, 0); + up_write(&adev->reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -4281,12 +4283,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, true); - if (hive && !mutex_trylock(&hive->reset_lock)) { - DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", - job ? job->base.id : -1, hive->hive_id); - mutex_unlock(&hive->hive_lock); - return 0; + hive = amdgpu_get_xgmi_hive(adev, false); + if (hive) { + if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", + job ? job->base.id : -1, hive->hive_id); + return 0; + } + mutex_lock(&hive->hive_lock); } /* @@ -4308,11 +4312,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { + if (!amdgpu_device_lock_adev(tmp_adev)) { DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", job ? job->base.id : -1); - mutex_unlock(&hive->hive_lock); - return 0; + r = 0; + goto skip_recovery; } /* @@ -4445,8 +4449,9 @@ skip_sched_resume: amdgpu_device_unlock_adev(tmp_adev); } +skip_recovery: if (hive) { - mutex_unlock(&hive->reset_lock); + atomic_set(&hive->in_reset, 0); mutex_unlock(&hive->hive_lock); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 7f9e50247413..73cc68ab53d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -671,6 +671,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, bo_va = NULL; } + down_read(&adev->reset_sem); + switch (args->operation) { case AMDGPU_VA_OP_MAP: va_flags = amdgpu_gem_va_map_flags(adev, args->flags); @@ -700,6 +702,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va, args->operation); + up_read(&adev->reset_sem); + error_backoff: ttm_eu_backoff_reservation(&ticket, &list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 78d37f92c7be..8eff0173360d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -724,7 +724,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) * * also don't wait anymore for IRQ context * */ - if (r < 1 && (adev->in_gpu_reset || in_interrupt())) + if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt())) goto failed_kiq_read; might_sleep(); @@ -782,7 +782,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) * * also don't wait anymore for IRQ context * */ - if (r < 1 && (adev->in_gpu_reset || in_interrupt())) + if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt())) goto failed_kiq_write; might_sleep(); @@ -801,5 +801,5 @@ failed_undo: amdgpu_ring_undo(ring); spin_unlock_irqrestore(&kiq->ring_lock, flags); failed_kiq_write: - pr_err("failed to write reg:%x\n", reg); + dev_warn(adev->dev, "failed to write reg:%x\n", reg); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 937029ad5271..75d37dfb51aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -220,17 +220,17 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) trace_amdgpu_sched_run_job(job); - if (job->vram_lost_counter != atomic_read(&ring->adev->vram_lost_counter)) - dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */ - - if (finished->error < 0) { - DRM_INFO("Skip scheduling IBs!\n"); - } else { + if (down_read_trylock(&ring->adev->reset_sem)) { r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, - &fence); + &fence); + up_read(&ring->adev->reset_sem); if (r) DRM_ERROR("Error scheduling IBs (%d)\n", r); + } else { + dma_fence_set_error(finished, -ECANCELED); + DRM_INFO("Skip scheduling IBs!\n"); } + /* if gpu reset, hw fence will be replaced here */ dma_fence_put(job->fence); job->fence = dma_fence_get(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index a8c47aecd342..b9c1fce6da79 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1084,6 +1084,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, if (!fpriv) return; + down_read(&adev->reset_sem); + pm_runtime_get_sync(dev->dev); if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_UVD) != NULL) @@ -1122,6 +1124,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); + + up_read(&adev->reset_sem); } /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 0a05db9b7132..2f7b0550ff16 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -163,7 +163,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev, enum amd_pm_state_type pm; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -172,6 +172,8 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { if (adev->smu.ppt_funcs->get_current_power_state) pm = smu_get_current_power_state(&adev->smu); @@ -183,6 +185,8 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev, pm = adev->pm.dpm.user_state; } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -201,7 +205,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev, enum amd_pm_state_type state; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (strncmp("battery", buf, strlen("battery")) == 0) @@ -219,6 +223,8 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { mutex_lock(&adev->pm.mutex); adev->pm.dpm.user_state = state; @@ -232,6 +238,9 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev, amdgpu_pm_compute_clocks(adev); } + + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -307,7 +316,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev, enum amd_dpm_forced_level level = 0xff; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -316,6 +325,8 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) level = smu_get_performance_level(&adev->smu); else if (adev->powerplay.pp_funcs->get_performance_level) @@ -323,6 +334,8 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev, else level = adev->pm.dpm.forced_level; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -349,7 +362,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, enum amd_dpm_forced_level current_level = 0xff; int ret = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (strncmp("low", buf, strlen("low")) == 0) { @@ -380,6 +393,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) current_level = smu_get_performance_level(&adev->smu); else if (adev->powerplay.pp_funcs->get_performance_level) @@ -388,7 +403,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, if (current_level == level) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return count; + ret = count; + goto pro_end; } if (adev->asic_type == CHIP_RAVEN) { @@ -409,7 +425,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, pr_err("Currently not in any profile mode!\n"); pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } if (is_support_sw_smu(adev)) { @@ -417,7 +434,8 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } } else if (adev->powerplay.pp_funcs->force_performance_level) { mutex_lock(&adev->pm.mutex); @@ -425,14 +443,16 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, mutex_unlock(&adev->pm.mutex); pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } ret = amdgpu_dpm_force_performance_level(adev, level); if (ret) { mutex_unlock(&adev->pm.mutex); pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } else { adev->pm.dpm.forced_level = level; } @@ -441,7 +461,9 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev, pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return count; +pro_end: + up_read(&adev->reset_sem); + return ret; } static ssize_t amdgpu_get_pp_num_states(struct device *dev, @@ -453,7 +475,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev, struct pp_states_info data; int i, buf_len, ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -497,7 +519,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev, enum amd_pm_state_type pm = 0; int i = 0, ret = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -538,7 +560,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (adev->pp_force_state_enabled) @@ -558,7 +580,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev, unsigned long idx; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (strlen(buf) == 1) @@ -584,6 +606,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev, return ret; } + down_read(&adev->reset_sem); /* only set user selected power states */ if (state != POWER_STATE_TYPE_INTERNAL_BOOT && state != POWER_STATE_TYPE_DEFAULT) { @@ -591,6 +614,8 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev, AMD_PP_TASK_ENABLE_USER_STATE, &state); adev->pp_force_state_enabled = true; } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); } @@ -618,7 +643,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev, char *table = NULL; int size, ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -662,7 +687,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int ret = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -671,16 +696,21 @@ static ssize_t amdgpu_set_pp_table(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { ret = smu_sys_set_pp_table(&adev->smu, (void *)buf, count); if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); + up_read(&adev->reset_sem); return ret; } } else if (adev->powerplay.pp_funcs->set_pp_table) amdgpu_dpm_set_pp_table(adev, buf, count); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -815,7 +845,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, const char delimiter[3] = {' ', '\n', '\0'}; uint32_t type; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (count > 127) @@ -858,6 +888,10 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, return ret; } + ret = count; + + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { ret = smu_od_edit_dpm_table(&adev->smu, type, parameter, parameter_size); @@ -865,7 +899,8 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } } else { if (adev->powerplay.pp_funcs->odn_edit_dpm_table) { @@ -874,7 +909,8 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } } @@ -885,18 +921,22 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, NULL); pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return count; + ret = count; + goto pro_end; } else { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return -EINVAL; + ret = -EINVAL; + goto pro_end; } } } pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); - return count; +pro_end: + up_read(&adev->reset_sem); + return ret; } static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, @@ -908,7 +948,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -962,7 +1002,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev, uint64_t featuremask; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = kstrtou64(buf, 0, &featuremask); @@ -977,11 +1017,13 @@ static ssize_t amdgpu_set_pp_features(struct device *dev, return ret; } + down_read(&adev->reset_sem); if (is_support_sw_smu(adev)) { ret = smu_sys_set_pp_feature_mask(&adev->smu, featuremask); if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } } else if (adev->powerplay.pp_funcs->set_ppfeature_status) { @@ -989,9 +1031,12 @@ static ssize_t amdgpu_set_pp_features(struct device *dev, if (ret) { pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1007,7 +1052,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1016,6 +1061,8 @@ static ssize_t amdgpu_get_pp_features(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_sys_get_pp_feature_mask(&adev->smu, buf); else if (adev->powerplay.pp_funcs->get_ppfeature_status) @@ -1023,6 +1070,8 @@ static ssize_t amdgpu_get_pp_features(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1068,7 +1117,7 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1077,6 +1126,8 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_SCLK, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1084,6 +1135,8 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1135,7 +1188,7 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev, int ret; uint32_t mask = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1148,11 +1201,15 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_SCLK, mask); else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1171,7 +1228,7 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1180,6 +1237,8 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_MCLK, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1187,6 +1246,8 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1203,7 +1264,7 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev, uint32_t mask = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1216,11 +1277,15 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_MCLK, mask); else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1239,7 +1304,7 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1248,6 +1313,8 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_SOCCLK, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1255,6 +1322,8 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1271,7 +1340,7 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev, int ret; uint32_t mask = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1284,6 +1353,8 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_SOCCLK, mask); else if (adev->powerplay.pp_funcs->force_clock_level) @@ -1291,6 +1362,8 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev, else ret = 0; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1309,7 +1382,7 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1318,6 +1391,8 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_FCLK, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1325,6 +1400,8 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1341,7 +1418,7 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev, int ret; uint32_t mask = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1354,6 +1431,8 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_FCLK, mask); else if (adev->powerplay.pp_funcs->force_clock_level) @@ -1361,6 +1440,8 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev, else ret = 0; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1379,7 +1460,7 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1388,6 +1469,8 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_DCEFCLK, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1395,6 +1478,8 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1411,7 +1496,7 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev, int ret; uint32_t mask = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1424,6 +1509,8 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_DCEFCLK, mask); else if (adev->powerplay.pp_funcs->force_clock_level) @@ -1431,6 +1518,8 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev, else ret = 0; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1449,7 +1538,7 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1458,6 +1547,8 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_print_clk_levels(&adev->smu, SMU_PCIE, buf); else if (adev->powerplay.pp_funcs->print_clock_levels) @@ -1465,6 +1556,8 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1481,7 +1574,7 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, int ret; uint32_t mask = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = amdgpu_read_mask(buf, count, &mask); @@ -1494,6 +1587,8 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_force_clk_levels(&adev->smu, SMU_PCIE, mask); else if (adev->powerplay.pp_funcs->force_clock_level) @@ -1501,6 +1596,8 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, else ret = 0; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1519,7 +1616,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev, uint32_t value = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1528,11 +1625,15 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) value = smu_get_od_percentage(&(adev->smu), SMU_OD_SCLK); else if (adev->powerplay.pp_funcs->get_sclk_od) value = amdgpu_dpm_get_sclk_od(adev); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1549,7 +1650,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev, int ret; long int value; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = kstrtol(buf, 0, &value); @@ -1563,6 +1664,8 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { value = smu_set_od_percentage(&(adev->smu), SMU_OD_SCLK, (uint32_t)value); } else { @@ -1577,6 +1680,8 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev, } } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1592,7 +1697,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev, uint32_t value = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1601,11 +1706,15 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) value = smu_get_od_percentage(&(adev->smu), SMU_OD_MCLK); else if (adev->powerplay.pp_funcs->get_mclk_od) value = amdgpu_dpm_get_mclk_od(adev); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1622,7 +1731,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev, int ret; long int value; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = kstrtol(buf, 0, &value); @@ -1636,6 +1745,8 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { value = smu_set_od_percentage(&(adev->smu), SMU_OD_MCLK, (uint32_t)value); } else { @@ -1650,6 +1761,8 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev, } } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1685,7 +1798,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev, ssize_t size; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(ddev->dev); @@ -1694,6 +1807,8 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) size = smu_get_power_profile_mode(&adev->smu, buf); else if (adev->powerplay.pp_funcs->get_power_profile_mode) @@ -1701,6 +1816,8 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev, else size = snprintf(buf, PAGE_SIZE, "\n"); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1725,7 +1842,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, long int profile_mode = 0; const char delimiter[3] = {' ', '\n', '\0'}; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; tmp[0] = *(buf); @@ -1758,11 +1875,15 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) ret = smu_set_power_profile_mode(&adev->smu, parameter, parameter_size, true); else if (adev->powerplay.pp_funcs->set_power_profile_mode) ret = amdgpu_dpm_set_power_profile_mode(adev, parameter, parameter_size); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1788,7 +1909,7 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int r, value, size = sizeof(value); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(ddev->dev); @@ -1797,9 +1918,11 @@ static ssize_t amdgpu_get_gpu_busy_percent(struct device *dev, return r; } + down_read(&adev->reset_sem); /* read the IP busy sensor */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_LOAD, (void *)&value, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1826,7 +1949,7 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int r, value, size = sizeof(value); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(ddev->dev); @@ -1835,10 +1958,14 @@ static ssize_t amdgpu_get_mem_busy_percent(struct device *dev, return r; } + down_read(&adev->reset_sem); + /* read the IP busy sensor */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MEM_LOAD, (void *)&value, &size); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1869,7 +1996,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev, uint64_t count0 = 0, count1 = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (adev->flags & AMD_IS_APU) @@ -1884,8 +2011,12 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev, return ret; } + down_read(&adev->reset_sem); + amdgpu_asic_get_pcie_usage(adev, &count0, &count1); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(ddev->dev); pm_runtime_put_autosuspend(ddev->dev); @@ -1910,7 +2041,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (adev->unique_id) @@ -2177,7 +2308,7 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev, int channel = to_sensor_dev_attr(attr)->index; int r, temp = 0, size = sizeof(temp); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (channel >= PP_TEMP_MAX) @@ -2189,6 +2320,8 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev, return r; } + down_read(&adev->reset_sem); + switch (channel) { case PP_TEMP_JUNCTION: /* get current junction temperature */ @@ -2210,6 +2343,8 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev, break; } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2313,7 +2448,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev, u32 pwm_mode = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(adev->ddev->dev); @@ -2322,18 +2457,23 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { pwm_mode = smu_get_fan_control_mode(&adev->smu); } else { if (!adev->powerplay.pp_funcs->get_fan_control_mode) { pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2349,7 +2489,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev, int err, ret; int value; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = kstrtoint(buf, 10, &value); @@ -2362,18 +2502,23 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { smu_set_fan_control_mode(&adev->smu, value); } else { if (!adev->powerplay.pp_funcs->set_fan_control_mode) { pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } amdgpu_dpm_set_fan_control_mode(adev, value); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2403,7 +2548,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev, u32 value; u32 pwm_mode; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = pm_runtime_get_sync(adev->ddev->dev); @@ -2412,11 +2557,15 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) pwm_mode = smu_get_fan_control_mode(&adev->smu); else pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + up_read(&adev->reset_sem); + if (pwm_mode != AMD_FAN_CTRL_MANUAL) { pr_info("manual fan speed control should be enabled first\n"); pm_runtime_mark_last_busy(adev->ddev->dev); @@ -2457,7 +2606,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev, int err; u32 speed = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = pm_runtime_get_sync(adev->ddev->dev); @@ -2466,6 +2615,8 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) err = smu_get_fan_speed_percent(&adev->smu, &speed); else if (adev->powerplay.pp_funcs->get_fan_speed_percent) @@ -2473,6 +2624,8 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev, else err = -EINVAL; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2492,7 +2645,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev, int err; u32 speed = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = pm_runtime_get_sync(adev->ddev->dev); @@ -2501,6 +2654,8 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) err = smu_get_fan_speed_rpm(&adev->smu, &speed); else if (adev->powerplay.pp_funcs->get_fan_speed_rpm) @@ -2508,6 +2663,8 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev, else err = -EINVAL; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2526,7 +2683,7 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev, u32 size = sizeof(min_rpm); int r; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2535,9 +2692,13 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev, return r; } + down_read(&adev->reset_sem); + r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MIN_FAN_RPM, (void *)&min_rpm, &size); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2556,7 +2717,7 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev, u32 size = sizeof(max_rpm); int r; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2565,9 +2726,13 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev, return r; } + down_read(&adev->reset_sem); + r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MAX_FAN_RPM, (void *)&max_rpm, &size); + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2585,7 +2750,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev, int err; u32 rpm = 0; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = pm_runtime_get_sync(adev->ddev->dev); @@ -2594,6 +2759,8 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) err = smu_get_fan_speed_rpm(&adev->smu, &rpm); else if (adev->powerplay.pp_funcs->get_fan_speed_rpm) @@ -2601,6 +2768,8 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev, else err = -EINVAL; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2619,7 +2788,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, u32 value; u32 pwm_mode; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = pm_runtime_get_sync(adev->ddev->dev); @@ -2628,11 +2797,15 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) pwm_mode = smu_get_fan_control_mode(&adev->smu); else pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + up_read(&adev->reset_sem); + if (pwm_mode != AMD_FAN_CTRL_MANUAL) { pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2646,6 +2819,8 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) err = smu_set_fan_speed_rpm(&adev->smu, value); else if (adev->powerplay.pp_funcs->set_fan_speed_rpm) @@ -2653,6 +2828,8 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, else err = -EINVAL; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2670,7 +2847,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev, u32 pwm_mode = 0; int ret; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; ret = pm_runtime_get_sync(adev->ddev->dev); @@ -2679,18 +2856,23 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev, return ret; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { pwm_mode = smu_get_fan_control_mode(&adev->smu); } else { if (!adev->powerplay.pp_funcs->get_fan_control_mode) { pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2707,7 +2889,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev, int value; u32 pwm_mode; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; err = kstrtoint(buf, 10, &value); @@ -2727,17 +2909,22 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { smu_set_fan_control_mode(&adev->smu, pwm_mode); } else { if (!adev->powerplay.pp_funcs->set_fan_control_mode) { pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); + up_read(&adev->reset_sem); return -EINVAL; } amdgpu_dpm_set_fan_control_mode(adev, pwm_mode); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2752,7 +2939,7 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev, u32 vddgfx; int r, size = sizeof(vddgfx); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2761,9 +2948,11 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev, return r; } + down_read(&adev->reset_sem); /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDGFX, (void *)&vddgfx, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2789,7 +2978,7 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev, u32 vddnb; int r, size = sizeof(vddnb); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; /* only APUs have vddnb */ @@ -2802,9 +2991,11 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev, return r; } + down_read(&adev->reset_sem); /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB, (void *)&vddnb, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2831,7 +3022,7 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev, int r, size = sizeof(u32); unsigned uw; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2840,9 +3031,11 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev, return r; } + down_read(&adev->reset_sem); /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_POWER, (void *)&query, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2872,7 +3065,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev, ssize_t size; int r; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2881,6 +3074,8 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev, return r; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { smu_get_power_limit(&adev->smu, &limit, true); size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); @@ -2891,6 +3086,8 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev, size = snprintf(buf, PAGE_SIZE, "\n"); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2906,7 +3103,7 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev, ssize_t size; int r; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2915,6 +3112,8 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev, return r; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) { smu_get_power_limit(&adev->smu, &limit, false); size = snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); @@ -2925,6 +3124,8 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev, size = snprintf(buf, PAGE_SIZE, "\n"); } + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2941,7 +3142,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev, int err; u32 value; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; if (amdgpu_sriov_vf(adev)) @@ -2960,6 +3161,8 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev, return err; } + down_read(&adev->reset_sem); + if (is_support_sw_smu(adev)) err = smu_set_power_limit(&adev->smu, value); else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_power_limit) @@ -2967,6 +3170,8 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev, else err = -EINVAL; + up_read(&adev->reset_sem); + pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -2984,7 +3189,7 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev, uint32_t sclk; int r, size = sizeof(sclk); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -2993,9 +3198,11 @@ static ssize_t amdgpu_hwmon_show_sclk(struct device *dev, return r; } + down_read(&adev->reset_sem); /* get the sclk */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_SCLK, (void *)&sclk, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -3021,7 +3228,7 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev, uint32_t mclk; int r, size = sizeof(mclk); - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(adev->ddev->dev); @@ -3030,9 +3237,11 @@ static ssize_t amdgpu_hwmon_show_mclk(struct device *dev, return r; } + down_read(&adev->reset_sem); /* get the sclk */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GFX_MCLK, (void *)&mclk, &size); + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(adev->ddev->dev); pm_runtime_put_autosuspend(adev->ddev->dev); @@ -3913,7 +4122,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) u32 flags = 0; int r; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EPERM; r = pm_runtime_get_sync(dev->dev); @@ -3922,7 +4131,10 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) return r; } + down_read(&adev->reset_sem); amdgpu_device_ip_get_clockgating_state(adev, &flags); + up_read(&adev->reset_sem); + seq_printf(m, "Clock Gating Flags Mask: 0x%x\n", flags); amdgpu_parse_cg_state(m, flags); seq_printf(m, "\n"); @@ -3934,6 +4146,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) return 0; } + down_read(&adev->reset_sem); if (!is_support_sw_smu(adev) && adev->powerplay.pp_funcs->debugfs_print_current_performance_level) { mutex_lock(&adev->pm.mutex); @@ -3946,6 +4159,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) } else { r = amdgpu_debugfs_pm_info_pp(m, adev); } + up_read(&adev->reset_sem); pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 8034111acd9a..a053b7af0680 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1684,7 +1684,7 @@ static int psp_load_smu_fw(struct psp_context *psp) return 0; - if (adev->in_gpu_reset && ras && ras->supported) { + if (amdgpu_in_reset(adev) && ras && ras->supported) { ret = amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_UNLOAD); if (ret) { DRM_WARN("Failed to set MP1 state prepare for reload\n"); @@ -1799,7 +1799,7 @@ static int psp_load_fw(struct amdgpu_device *adev) int ret; struct psp_context *psp = &adev->psp; - if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) { + if (amdgpu_sriov_vf(adev) && amdgpu_in_reset(adev)) { psp_ring_stop(psp, PSP_RING_TYPE__KM); /* should not destroy ring, only stop */ goto skip_memalloc; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bcce4c0be462..5680f7eafcb1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1978,7 +1978,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, amdgpu_ras_request_reset_on_boot(adev, ras_block->block); return 0; - } else if (adev->in_suspend || adev->in_gpu_reset) { + } else if (adev->in_suspend || amdgpu_in_reset(adev)) { /* in resume phase, if fail to enable ras, * clean up all ras fs nodes, and disable ras */ goto cleanup; @@ -1987,7 +1987,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, } /* in resume phase, no need to create ras fs node */ - if (adev->in_suspend || adev->in_gpu_reset) + if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; if (ih_info->cb) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 5f2f3faad792..605d266754f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2088,7 +2088,7 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable) uint64_t size; int r; - if (!adev->mman.initialized || adev->in_gpu_reset || + if (!adev->mman.initialized || amdgpu_in_reset(adev) || adev->mman.buffer_funcs_enabled == enable) return; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c index 183743c5fb7b..039245c98ff8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -628,7 +628,8 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev) struct amdgpu_firmware_info *ucode = NULL; /* for baremetal, the ucode is allocated in gtt, so don't need to fill the bo when reset/suspend */ - if (!amdgpu_sriov_vf(adev) && (adev->in_gpu_reset || adev->in_suspend)) + if (!amdgpu_sriov_vf(adev) && + (amdgpu_in_reset(adev) || adev->in_suspend)) return 0; /* * if SMU loaded firmware, it needn't add SMC, UVD, and VCE diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 1203c20491e6..5cae39d35c04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -93,7 +93,7 @@ failed_undo: amdgpu_ring_undo(ring); spin_unlock_irqrestore(&kiq->ring_lock, flags); failed_kiq: - pr_err("failed to write reg %x wait reg %x\n", reg0, reg1); + dev_warn(adev->dev, "failed to write reg %x wait reg %x\n", reg0, reg1); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index f826945989c7..b2046c3a404d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -325,9 +325,9 @@ static inline bool is_virtual_machine(void) #define amdgpu_sriov_is_pp_one_vf(adev) \ ((adev)->virt.gim_feature & AMDGIM_FEATURE_PP_ONE_VF) #define amdgpu_sriov_is_debug(adev) \ - ((!adev->in_gpu_reset) && adev->virt.tdr_debug) + ((!amdgpu_in_reset(adev)) && adev->virt.tdr_debug) #define amdgpu_sriov_is_normal(adev) \ - ((!adev->in_gpu_reset) && (!adev->virt.tdr_debug)) + ((!amdgpu_in_reset(adev)) && (!adev->virt.tdr_debug)) bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev); void amdgpu_virt_init_setting(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e3a3755cb999..4e017f379eb6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -372,7 +372,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo tmp->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&tmp->device_list); mutex_init(&tmp->hive_lock); - mutex_init(&tmp->reset_lock); + atomic_set(&tmp->in_reset, 0); task_barrier_init(&tmp->tb); if (lock) @@ -397,6 +397,7 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) hive->hi_req_gpu : adev; bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; + bool locked; /* fw bug so temporarily disable pstate switching */ return 0; @@ -404,7 +405,9 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) if (!hive || adev->asic_type != CHIP_VEGA20) return 0; - mutex_lock(&hive->hive_lock); + locked = atomic_read(&hive->in_reset) ? false : true; + if (locked) + mutex_lock(&hive->hive_lock); if (is_hi_req) hive->hi_req_count++; @@ -439,7 +442,8 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) adev : NULL; } out: - mutex_unlock(&hive->hive_lock); + if (locked) + mutex_unlock(&hive->hive_lock); return ret; } @@ -594,7 +598,6 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) if(!(--hive->number_devices)){ amdgpu_xgmi_sysfs_destroy(adev, hive); mutex_destroy(&hive->hive_lock); - mutex_destroy(&hive->reset_lock); } return psp_xgmi_terminate(&adev->psp); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6999eab16a72..61720cd4a1ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -30,7 +30,8 @@ struct amdgpu_hive_info { uint64_t hive_id; struct list_head device_list; int number_devices; - struct mutex hive_lock, reset_lock; + struct mutex hive_lock; + atomic_t in_reset; struct kobject *kobj; struct device_attribute dev_attr; struct amdgpu_device *adev; diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index 4cfc786699c7..8341bd965202 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -755,6 +755,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg) /* jiffies wrap around we will just wait a little longer */ ctx->last_jump_jiffies = jiffies; } + schedule(); } else { ctx->last_jump = ctx->start + target; ctx->last_jump_jiffies = jiffies; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 8344c3b0b9b5..db9f1e89a0f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -6180,7 +6180,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring) struct v10_gfx_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.gfx_ring[0]; - if (!adev->in_gpu_reset && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -6192,7 +6192,7 @@ static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring) mutex_unlock(&adev->srbm_mutex); if (adev->gfx.me.mqd_backup[mqd_idx]) memcpy(adev->gfx.me.mqd_backup[mqd_idx], mqd, sizeof(*mqd)); - } else if (adev->in_gpu_reset) { + } else if (amdgpu_in_reset(adev)) { /* reset mqd with the backup copy */ if (adev->gfx.me.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.me.mqd_backup[mqd_idx], sizeof(*mqd)); @@ -6541,7 +6541,7 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring) gfx_v10_0_kiq_setting(ring); - if (adev->in_gpu_reset) { /* for GPU_RESET case */ + if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); @@ -6577,7 +6577,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring) struct v10_compute_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.compute_ring[0]; - if (!adev->in_gpu_reset && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(*mqd)); mutex_lock(&adev->srbm_mutex); nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0); @@ -6587,7 +6587,7 @@ static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring) if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd)); - } else if (adev->in_gpu_reset) { /* for GPU_RESET case */ + } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd)); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 33f1c4a46ebe..8d7208959c95 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4632,7 +4632,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) gfx_v8_0_kiq_setting(ring); - if (adev->in_gpu_reset) { /* for GPU_RESET case */ + if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); @@ -4669,7 +4669,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) struct vi_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.compute_ring[0]; - if (!adev->in_gpu_reset && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; @@ -4681,7 +4681,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); - } else if (adev->in_gpu_reset) { /* for GPU_RESET case */ + } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index cb9d60a4e05e..e4e751f87092 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3684,7 +3684,7 @@ static int gfx_v9_0_kiq_init_queue(struct amdgpu_ring *ring) gfx_v9_0_kiq_setting(ring); - if (adev->in_gpu_reset) { /* for GPU_RESET case */ + if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); @@ -3722,7 +3722,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring) struct v9_mqd *mqd = ring->mqd_ptr; int mqd_idx = ring - &adev->gfx.compute_ring[0]; - if (!adev->in_gpu_reset && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { memset((void *)mqd, 0, sizeof(struct v9_mqd_allocation)); ((struct v9_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct v9_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; @@ -3734,7 +3734,7 @@ static int gfx_v9_0_kcq_init_queue(struct amdgpu_ring *ring) if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct v9_mqd_allocation)); - } else if (adev->in_gpu_reset) { /* for GPU_RESET case */ + } else if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct v9_mqd_allocation)); @@ -3928,7 +3928,7 @@ static int gfx_v9_0_hw_fini(void *handle) /* Use deinitialize sequence from CAIL when unbinding device from driver, * otherwise KIQ is hanging when binding back */ - if (!adev->in_gpu_reset && !adev->in_suspend) { + if (!amdgpu_in_reset(adev) && !adev->in_suspend) { mutex_lock(&adev->srbm_mutex); soc15_grbm_select(adev, adev->gfx.kiq.ring.me, adev->gfx.kiq.ring.pipe, @@ -4086,7 +4086,7 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev) * * also don't wait anymore for IRQ context * */ - if (r < 1 && (adev->in_gpu_reset || in_interrupt())) + if (r < 1 && (amdgpu_in_reset(adev) || in_interrupt())) goto failed_kiq_read; might_sleep(); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 19051ce122b1..1a78073c2f05 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -268,7 +268,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !adev->in_gpu_reset) { + !amdgpu_in_reset(adev)) { struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; const unsigned eng = 17; @@ -293,7 +293,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, if (!adev->mman.buffer_funcs_enabled || !adev->ib_pool_ready || - adev->in_gpu_reset || + amdgpu_in_reset(adev) || ring->sched.ready == false) { gmc_v10_0_flush_vm_hub(adev, vmid, AMDGPU_GFXHUB_0, 0); mutex_unlock(&adev->mman.gtt_window_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index e18296dc1386..0f8e8aff9114 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -434,7 +434,7 @@ static int gmc_v7_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, int vmid; unsigned int tmp; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; for (vmid = 1; vmid < 16; vmid++) { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index a9e722b8a458..abe64010f0d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -635,7 +635,7 @@ static int gmc_v8_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, int vmid; unsigned int tmp; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; for (vmid = 1; vmid < 16; vmid++) { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 6e4f3ff4810f..c5f94bab4a01 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -501,7 +501,7 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, */ if (adev->gfx.kiq.ring.sched.ready && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && - !adev->in_gpu_reset) { + !amdgpu_in_reset(adev)) { uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; @@ -596,7 +596,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, struct amdgpu_ring *ring = &adev->gfx.kiq.ring; struct amdgpu_kiq *kiq = &adev->gfx.kiq; - if (adev->in_gpu_reset) + if (amdgpu_in_reset(adev)) return -EIO; if (ring->sched.ready) { @@ -633,7 +633,8 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, spin_unlock(&adev->gfx.kiq.ring_lock); r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); if (r < 1) { - DRM_ERROR("wait for kiq fence error: %ld.\n", r); + dev_info(adev->dev, + "wait for kiq fence error: %ld\n", r); return -ETIME; } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 5fd67e1cc2a0..fe31cbeccfe9 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -238,20 +238,16 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT; - int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. * - * we can unlock the lock_reset to allow "amdgpu_job_timedout" + * we can unlock the reset_sem to allow "amdgpu_job_timedout" * to run gpu_recover() after FLR_NOTIFICATION_CMPL received * which means host side had finished this VF's FLR. */ - locked = mutex_trylock(&adev->lock_reset); - if (locked) - adev->in_gpu_reset = true; - + down_read(&adev->reset_sem); do { if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) goto flr_done; @@ -261,10 +257,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - if (locked) { - adev->in_gpu_reset = false; - mutex_unlock(&adev->lock_reset); - } + up_read(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index ce2bf1fb79ed..6f55172e8337 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -259,20 +259,16 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT; - int locked; /* block amdgpu_gpu_recover till msg FLR COMPLETE received, * otherwise the mailbox msg will be ruined/reseted by * the VF FLR. * - * we can unlock the lock_reset to allow "amdgpu_job_timedout" + * we can unlock the reset_sem to allow "amdgpu_job_timedout" * to run gpu_recover() after FLR_NOTIFICATION_CMPL received * which means host side had finished this VF's FLR. */ - locked = mutex_trylock(&adev->lock_reset); - if (locked) - adev->in_gpu_reset = true; - + down_read(&adev->reset_sem); do { if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL) goto flr_done; @@ -282,10 +278,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } while (timeout > 1); flr_done: - if (locked) { - adev->in_gpu_reset = false; - mutex_unlock(&adev->lock_reset); - } + up_read(&adev->reset_sem); /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e0e60b0d0669..7ad1537820b5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -304,15 +304,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { - /* On GFX v7, CP doesn't flush TC at dequeue */ - if (q->device->device_info->asic_family == CHIP_HAWAII) - if (flush_texture_cache_nocpsch(q->device, qpd)) - pr_err("Failed to flush TC\n"); + if (!dqm->is_resetting) { + /* On GFX v7, CP doesn't flush TC at dequeue */ + if (q->device->device_info->asic_family == CHIP_HAWAII) + if (flush_texture_cache_nocpsch(q->device, qpd)) + pr_err("Failed to flush TC\n"); - kfd_flush_tlb(qpd_to_pdd(qpd)); + kfd_flush_tlb(qpd_to_pdd(qpd)); - /* Release the vmid mapping */ - set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + } dqm->vmid_pasid[qpd->vmid] = 0; qpd->vmid = 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 40695d52e9a8..ee2258404c8f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1551,6 +1551,10 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, void kfd_flush_tlb(struct kfd_process_device *pdd) { struct kfd_dev *dev = pdd->dev; + struct device_queue_manager *dqm = dev->dqm; + + if (dqm->is_resetting) + return; if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { /* Nothing to flush until a VMID is assigned, which diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 03a60c749834..cf79857ece53 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1647,7 +1647,7 @@ static int dm_suspend(void *handle) struct amdgpu_display_manager *dm = &adev->dm; int ret = 0; - if (adev->in_gpu_reset) { + if (amdgpu_in_reset(adev)) { mutex_lock(&dm->dc_lock); dm->cached_dc_state = dc_copy_state(dm->dc->current_state); @@ -1833,7 +1833,7 @@ static int dm_resume(void *handle) struct dc_state *dc_state; int i, r, j; - if (adev->in_gpu_reset) { + if (amdgpu_in_reset(adev)) { dc_state = dm->cached_dc_state; r = dm_dmub_hw_init(adev); diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 727cb9fd4aee..0eeccf3341a3 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -992,7 +992,7 @@ static int smu_disable_dpms(struct smu_context *smu) struct amdgpu_device *adev = smu->adev; int ret = 0; bool use_baco = !smu->is_apu && - ((adev->in_gpu_reset && + ((amdgpu_in_reset(adev) && (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) || ((adev->in_runpm || adev->in_hibernate) && amdgpu_asic_supports_baco(adev))); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c index 3b8839641770..bacbe2fa1f9a 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c @@ -484,7 +484,7 @@ static int vega20_setup_asic_task(struct pp_hwmgr *hwmgr) { struct amdgpu_device *adev = (struct amdgpu_device *)(hwmgr->adev); int ret = 0; - bool use_baco = (adev->in_gpu_reset && + bool use_baco = (amdgpu_in_reset(adev) && (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) || (adev->in_runpm && amdgpu_asic_supports_baco(adev)); -- 2.11.0