drm/amdgpu: add code to capture invalid hardware access during recovery
author Dennis Li <Dennis.Li@amd.com>
Wed, 10 Mar 2021 09:20:45 +0000 (17:20 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Apr 2021 20:34:53 +0000 (16:34 -0400)
When the recovery thread has begun a GPU reset, no other thread should
access the hardware; otherwise the system randomly hangs.

v2 (chk): rewritten from scratch, use trylock and lockdep instead of
hand-wiring the logic.

v3: add in_irq check

v4: change to check in_task

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
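
The locking protocol the new helper relies on looks roughly like the
sketch below. This is a minimal, self-contained illustration, not
amdgpu code: struct dev_ctx and the do_gpu_reset()/skip_hw_access()/
reg_read() functions are hypothetical stand-ins, while rw_semaphore,
down_write()/up_write(), down_read_trylock()/up_read(),
lockdep_assert_held() and in_task() are the actual kernel primitives
the patch uses.

#include <linux/rwsem.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>      /* in_task() */
#include <linux/types.h>

struct dev_ctx {
        struct rw_semaphore reset_sem;  /* initialized with init_rwsem() */
        bool in_err_recovery;           /* set while PCI error recovery runs */
};

/* Reset thread: hold the write side for the whole reset, excluding
 * every other hardware access. */
static void do_gpu_reset(struct dev_ctx *ctx)
{
        down_write(&ctx->reset_sem);
        /* ... actual hardware reset sequence ... */
        up_write(&ctx->reset_sem);
}

/* Everyone else: assert that no reset is running in parallel. */
static bool skip_hw_access(struct dev_ctx *ctx)
{
        if (ctx->in_err_recovery)
                return true;

#ifdef CONFIG_LOCKDEP
        if (in_task()) {
                if (down_read_trylock(&ctx->reset_sem))
                        up_read(&ctx->reset_sem);  /* no reset in flight */
                else
                        /* We must already hold the read side, or be the
                         * reset thread holding the write side; lockdep
                         * complains about anything else. */
                        lockdep_assert_held(&ctx->reset_sem);
        }
#endif
        return false;
}

static u32 reg_read(struct dev_ctx *ctx, u32 reg)
{
        if (skip_hw_access(ctx))
                return 0;       /* device unreachable, fail the access */
        /* ... actual MMIO read ... */
        return 0;
}

The enforcement is deliberately lockdep-only: with CONFIG_LOCKDEP
disabled the trylock/unlock pair compiles away entirely, so production
kernels pay nothing for the check, and the in_task() guard keeps the
rw_semaphore from being touched outside task context, where it is not
safe to use (the reason for the v3/v4 revisions above).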

drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a037c22..4c2fabe 100644
@@ -1390,6 +1390,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev);
 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);
 bool amdgpu_device_load_pci_state(struct pci_dev *pdev);
 
+bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev);
+
 #include "amdgpu_object.h"
 
 static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0f82c5d..2d08062 100644
@@ -326,6 +326,35 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 /*
  * register access helper functions.
  */
+
+/* Check if hw access should be skipped because of hotplug or device error */
+bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
+{
+       if (adev->in_pci_err_recovery)
+               return true;
+
+#ifdef CONFIG_LOCKDEP
+       /*
+        * This is a bit complicated to understand, so worth a comment. What we assert
+        * here is that the GPU reset is not running on another thread in parallel.
+        *
+        * For this we trylock the read side of the reset semaphore, if that succeeds
+        * we know that the reset is not running in parallel.
+        *
+        * If the trylock fails we assert that we are either already holding the read
+        * side of the lock or are the reset thread itself and hold the write side of
+        * the lock.
+        */
+       if (in_task()) {
+               if (down_read_trylock(&adev->reset_sem))
+                       up_read(&adev->reset_sem);
+               else
+                       lockdep_assert_held(&adev->reset_sem);
+       }
+#endif
+       return false;
+}
+
 /**
  * amdgpu_device_rreg - read a memory mapped IO or indirect register
  *
@@ -340,7 +369,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 {
        uint32_t ret;
 
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if ((reg * 4) < adev->rmmio_size) {
@@ -377,7 +406,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (offset < adev->rmmio_size)
@@ -402,7 +431,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (offset < adev->rmmio_size)
@@ -425,7 +454,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
                        uint32_t reg, uint32_t v,
                        uint32_t acc_flags)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if ((reg * 4) < adev->rmmio_size) {
@@ -452,7 +481,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
                             uint32_t reg, uint32_t v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (amdgpu_sriov_fullaccess(adev) &&
@@ -476,7 +505,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -499,7 +528,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -520,7 +549,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        if (index < adev->doorbell.num_doorbells) {
@@ -543,7 +572,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        if (index < adev->doorbell.num_doorbells) {
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 689addb..f63f66a 100644
@@ -705,7 +705,7 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
        struct amdgpu_kiq *kiq = &adev->gfx.kiq;
        struct amdgpu_ring *ring = &kiq->ring;
 
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return 0;
 
        BUG_ON(!ring->funcs->emit_rreg);
@@ -772,7 +772,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
        BUG_ON(!ring->funcs->emit_wreg);
 
-       if (adev->in_pci_err_recovery)
+       if (amdgpu_device_skip_hw_access(adev))
                return;
 
        spin_lock_irqsave(&kiq->ring_lock, flags);
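
Code paths that legitimately overlap with a recovery are expected to
hold the read side of the semaphore around their hardware access, which
is exactly what the lockdep_assert_held() branch above tolerates. A
hypothetical caller (illustrative, not taken from the driver) would
look like:

        down_read(&adev->reset_sem);
        val = amdgpu_device_rreg(adev, reg, 0);
        up_read(&adev->reset_sem);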