drm/i915: Split I915_RESET_IN_PROGRESS into two flags

author Chris Wilson <chris@chris-wilson.co.uk>

Thu, 16 Mar 2017 17:13:02 +0000 (17:13 +0000)

committer Chris Wilson <chris@chris-wilson.co.uk>

Thu, 16 Mar 2017 17:17:10 +0000 (17:17 +0000)
author Chris Wilson <chris@chris-wilson.co.uk>
Thu, 16 Mar 2017 17:13:02 +0000 (17:13 +0000)
committer Chris Wilson <chris@chris-wilson.co.uk>
Thu, 16 Mar 2017 17:17:10 +0000 (17:17 +0000)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c

index 505c402..d62cf2e 100644 (file)
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
         enum intel_engine_id id;
  
         if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-               seq_printf(m, "Wedged\n");
-       if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
-               seq_printf(m, "Reset in progress\n");
+               seq_puts(m, "Wedged\n");
+       if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
+               seq_puts(m, "Reset in progress: struct_mutex backoff\n");
+       if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
+               seq_puts(m, "Reset in progress: reset handoff to waiter\n");
         if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-               seq_printf(m, "Waiter holding struct mutex\n");
+               seq_puts(m, "Waiter holding struct mutex\n");
         if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-               seq_printf(m, "struct_mutex blocked for reset\n");
+               seq_puts(m, "struct_mutex blocked for reset\n");
  
         if (!i915.enable_hangcheck) {
-               seq_printf(m, "Hangcheck disabled\n");
+               seq_puts(m, "Hangcheck disabled\n");
                 return 0;
         }
  
@@ -4127,7 +4129,7 @@ i915_wedged_set(void *data, u64 val)
          * while it is writing to 'i915_wedged'
          */
  
-       if (i915_reset_in_progress(&dev_priv->gpu_error))
+       if (i915_reset_backoff(&dev_priv->gpu_error))
                 return -EAGAIN;
  
         i915_handle_error(dev_priv, val,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c

index 9164167..be3c812 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
         int ret;
  
         lockdep_assert_held(&dev_priv->drm.struct_mutex);
+       GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
  
-       if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
+       if (!test_bit(I915_RESET_HANDOFF, &error->flags))
                 return;
  
         /* Clear any previous failed attempts at recovery. Time to try again. */
@@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
  wakeup:
         i915_gem_reset_finish(dev_priv);
         enable_irq(dev_priv->drm.irq);
-       wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
+
+       clear_bit(I915_RESET_HANDOFF, &error->flags);
+       wake_up_bit(&error->flags, I915_RESET_HANDOFF);
         return;
  
  error:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h

index 6e14c7d..fcc8184 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1595,8 +1595,33 @@ struct i915_gpu_error {
          */
         unsigned long reset_count;
  
+       /**
+        * flags: Control various stages of the GPU reset
+        *
+        * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
+        * other users acquiring the struct_mutex. To do this we set the
+        * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
+        * and then check for that bit before acquiring the struct_mutex (in
+        * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
+        * secondary role in preventing two concurrent global reset attempts.
+        *
+        * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
+        * struct_mutex. We try to acquire the struct_mutex in the reset worker,
+        * but it may be held by some long running waiter (that we cannot
+        * interrupt without causing trouble). Once we are ready to do the GPU
+        * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
+        * they already hold the struct_mutex and want to participate they can
+        * inspect the bit and do the reset directly, otherwise the worker
+        * waits for the struct_mutex.
+        *
+        * #I915_WEDGED - If reset fails and we can no longer use the GPU,
+        * we set the #I915_WEDGED bit. Prior to command submission, e.g.
+        * i915_gem_request_alloc(), this bit is checked and the sequence
+        * aborted (with -EIO reported to userspace) if set.
+        */
         unsigned long flags;
-#define I915_RESET_IN_PROGRESS 0
+#define I915_RESET_BACKOFF     0
+#define I915_RESET_HANDOFF     1
  #define I915_WEDGED            (BITS_PER_LONG - 1)
  
         /**
@@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
  
  void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
  
-static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff(struct i915_gpu_error *error)
+{
+       return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
+}
+
+static inline bool i915_reset_handoff(struct i915_gpu_error *error)
  {
-       return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
+       return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
  }
  
  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
@@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
         return unlikely(test_bit(I915_WEDGED, &error->flags));
  }
  
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
  {
-       return i915_reset_in_progress(error) | i915_terminally_wedged(error);
+       return i915_reset_backoff(error) | i915_terminally_wedged(error);
  }
  
  static inline u32 i915_reset_count(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c

index d87983b..ecd1b03 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
  
         might_sleep();
  
-       if (!i915_reset_in_progress(error))
-               return 0;
-
         /*
          * Only wait 10 seconds for the gpu reset to complete to avoid hanging
          * userspace. If it takes that long something really bad is going on and
          * we should simply try to bail out and fail as gracefully as possible.
          */
         ret = wait_event_interruptible_timeout(error->reset_queue,
-                                              !i915_reset_in_progress(error),
+                                              !i915_reset_backoff(error),
                                                I915_RESET_TIMEOUT);
         if (ret == 0) {
                 DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c

index 1e1d9f2..0e8d101 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
  
  static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
  {
-       if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
+       if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
                 return false;
  
         __set_current_state(TASK_RUNNING);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c

index 6e71639..cb20c94 100644 (file)
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
         return ret;
  }
  
-static void i915_error_wake_up(struct drm_i915_private *dev_priv)
-{
-       /*
-        * Notify all waiters for GPU completion events that reset state has
-        * been changed, and that they need to restart their wait after
-        * checking for potential errors (and bail out to drop locks if there is
-        * a gpu reset pending so that i915_error_work_func can acquire them).
-        */
-
-       /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-       wake_up_all(&dev_priv->gpu_error.wait_queue);
-
-       /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
-       wake_up_all(&dev_priv->pending_flip_queue);
-}
-
  /**
   * i915_reset_and_wakeup - do process context error handling work
   * @dev_priv: i915 device private
@@ -2668,6 +2652,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
  
         intel_prepare_reset(dev_priv);
  
+       set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+       wake_up_all(&dev_priv->gpu_error.wait_queue);
+
         do {
                 /*
                  * All state reset _must_ be completed before we update the
@@ -2682,7 +2669,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
  
                 /* We need to wait for anyone holding the lock to wakeup */
         } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
-                                    I915_RESET_IN_PROGRESS,
+                                    I915_RESET_HANDOFF,
                                      TASK_UNINTERRUPTIBLE,
                                      HZ));
  
@@ -2696,6 +2683,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
          * Note: The wake_up also serves as a memory barrier so that
          * waiters see the updated value of the dev_priv->gpu_error.
          */
+       clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
         wake_up_all(&dev_priv->gpu_error.reset_queue);
  }
  
@@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
         if (!engine_mask)
                 goto out;
  
-       if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+       if (test_and_set_bit(I915_RESET_BACKOFF,
                              &dev_priv->gpu_error.flags))
                 goto out;
  
-       /*
-        * Wakeup waiting processes so that the reset function
-        * i915_reset_and_wakeup doesn't deadlock trying to grab
-        * various locks. By bumping the reset counter first, the woken
-        * processes will see a reset in progress and back off,
-        * releasing their locks and then wait for the reset completion.
-        * We must do this for _all_ gpu waiters that might hold locks
-        * that the reset work needs to acquire.
-        *
-        * Note: The wake_up also provides a memory barrier to ensure that the
-        * waiters see the updated value of the reset flags.
-        */
-       i915_error_wake_up(dev_priv);
-
         i915_reset_and_wakeup(dev_priv);
  
  out:
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c

index 4b73513..5959c9b 100644 (file)
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
  {
         struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
  
-       if (i915_reset_in_progress(error))
+       if (i915_reset_backoff(error))
                 return true;
  
         if (crtc->reset_count != i915_reset_count(error))
@@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
                 goto cleanup;
  
         intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
-       if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
+       if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
                 ret = -EIO;
                 goto unlock;
         }
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c

index d4acee6..6ec7c73 100644 (file)
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
  
         /* Check that we can issue a global GPU reset */
  
-       set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+       set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+       set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
  
         mutex_lock(&i915->drm.struct_mutex);
         reset_count = i915_reset_count(&i915->gpu_error);
@@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
         }
         mutex_unlock(&i915->drm.struct_mutex);
  
-       GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
+       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
+       clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
         if (i915_terminally_wedged(&i915->gpu_error))
                 err = -EIO;
  
@@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
  
         reset_count = i915_reset_count(&rq->i915->gpu_error);
  
-       set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
+       set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
         wake_up_all(&rq->i915->gpu_error.wait_queue);
  
         return reset_count;
@@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
  
         /* Check that we detect a stuck waiter and issue a reset */
  
-       set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+       set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
  
         mutex_lock(&i915->drm.struct_mutex);
         err = hang_init(&h, i915);
@@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
                 err = timeout;
                 goto out_rq;
         }
-       GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
  
+       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
         if (i915_reset_count(&i915->gpu_error) == reset_count) {
                 pr_err("No GPU reset recorded!\n");
                 err = -EINVAL;
@@ -402,6 +404,7 @@ fini:
         hang_fini(&h);
  unlock:
         mutex_unlock(&i915->drm.struct_mutex);
+       clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
  
         if (i915_terminally_wedged(&i915->gpu_error))
                 return -EIO;
@@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
         if (!igt_can_mi_store_dword_imm(i915))
                 return 0;
  
+       set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
         mutex_lock(&i915->drm.struct_mutex);
         err = hang_init(&h, i915);
         if (err)
@@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
  
                         i915_reset(i915);
  
-                       GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
+                       GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
                                             &i915->gpu_error.flags));
+
                         if (prev->fence.error != -EIO) {
                                 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                                        prev->fence.error);
@@ -514,6 +519,7 @@ fini:
         hang_fini(&h);
  unlock:
         mutex_unlock(&i915->drm.struct_mutex);
+       clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
  
         if (i915_terminally_wedged(&i915->gpu_error))
                 return -EIO;
author	Chris Wilson <chris@chris-wilson.co.uk>
	Thu, 16 Mar 2017 17:13:02 +0000 (17:13 +0000)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Thu, 16 Mar 2017 17:17:10 +0000 (17:17 +0000)
drivers/gpu/drm/i915/i915_debugfs.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_drv.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_drv.h		patch \| blob \| history
drivers/gpu/drm/i915/i915_gem.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_gem_request.c		patch \| blob \| history
drivers/gpu/drm/i915/i915_irq.c		patch \| blob \| history
drivers/gpu/drm/i915/intel_display.c		patch \| blob \| history
drivers/gpu/drm/i915/selftests/intel_hangcheck.c		patch \| blob \| history