OSDN Git Service

drm/i915/selftest: Fix hangcheck self test for GuC submission
author John Harrison <John.C.Harrison@Intel.com>
Tue, 27 Jul 2021 00:23:45 +0000 (17:23 -0700)
committer John Harrison <John.C.Harrison@Intel.com>
Wed, 28 Jul 2021 00:32:23 +0000 (17:32 -0700)
When GuC submission is enabled, the GuC controls engine resets. Rather
than explicitly triggering a reset, the driver must submit a hanging
context to GuC and wait for the reset to occur.

Conversely, one of the tests specifically sends hanging batches to the
engines but wants them to sit around until a manual reset of the full
GT (including GuC itself). That means disabling GuC based engine
resets to prevent those from killing the hanging batch too soon. So,
add support to the scheduling policy helper for disabling resets as
well as making them quicker!

In GuC submission mode, the 'is engine idle' test basically turns into
'is engine PM wakelock held'. Independently, there is a heartbeat
disable helper function that the tests use. For unexplained reasons,
this acquires the engine wakelock before disabling the heartbeat and
only releases it when re-enabling the heartbeat. As one of the tests
tries to do a wait for idle in the middle of a heartbeat disabled
section, it is therefore guaranteed to always fail. Added a 'no_pm'
variant of the heartbeat helper that allows the engine to be asleep
while also having heartbeats disabled.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210727002348.97202-31-matthew.brost@intel.com
drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h
drivers/gpu/drm/i915/gt/selftest_hangcheck.c
drivers/gpu/drm/i915/gt/selftest_mocs.c
drivers/gpu/drm/i915/gt/selftest_workarounds.c
drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.c
drivers/gpu/drm/i915/selftests/intel_scheduler_helpers.h

index 4896e4c..317eebf 100644 (file)
@@ -405,3 +405,25 @@ void st_engine_heartbeat_enable(struct intel_engine_cs *engine)
        engine->props.heartbeat_interval_ms =
                engine->defaults.heartbeat_interval_ms;
 }
+
+void st_engine_heartbeat_disable_no_pm(struct intel_engine_cs *engine)
+{
+       /*
+        * Heartbeat off without holding the PM lock, which would make the engine
+        * appear not-idle; interval 0 keeps it off across any later unpark.
+        */
+       engine->props.heartbeat_interval_ms = 0;
+
+       /* Only park if already awake; the temporary PM ref is dropped at once. */
+       if (!intel_engine_pm_get_if_awake(engine))
+               return;
+
+       intel_engine_park_heartbeat(engine);
+       intel_engine_pm_put(engine);
+}
+
+void st_engine_heartbeat_enable_no_pm(struct intel_engine_cs *engine)
+{
+       /* Undo the _no_pm disable: restore the default interval, no PM calls. */
+       engine->props.heartbeat_interval_ms = engine->defaults.heartbeat_interval_ms;
+}
index cd27113..81da2cd 100644 (file)
@@ -9,6 +9,8 @@
 struct intel_engine_cs;
 
 void st_engine_heartbeat_disable(struct intel_engine_cs *engine);
+void st_engine_heartbeat_disable_no_pm(struct intel_engine_cs *engine);
 void st_engine_heartbeat_enable(struct intel_engine_cs *engine);
+void st_engine_heartbeat_enable_no_pm(struct intel_engine_cs *engine);
 
 #endif /* SELFTEST_ENGINE_HEARTBEAT_H */
index f93ba40..e0e200b 100644 (file)
@@ -17,6 +17,8 @@
 #include "selftests/igt_flush_test.h"
 #include "selftests/igt_reset.h"
 #include "selftests/igt_atomic.h"
+#include "selftests/igt_spinner.h"
+#include "selftests/intel_scheduler_helpers.h"
 
 #include "selftests/mock_drm.h"
 
@@ -450,6 +452,14 @@ static int igt_reset_nop_engine(void *arg)
                IGT_TIMEOUT(end_time);
                int err;
 
+               if (intel_engine_uses_guc(engine)) {
+                       /* Engine level resets are triggered by GuC when a hang
+                        * is detected. They can't be triggered by the KMD any
+                        * more. Thus a nop batch cannot be used as a reset test
+                        */
+                       continue;
+               }
+
                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %d!\n", engine->name, err);
@@ -561,6 +571,10 @@ static int igt_reset_fail_engine(void *arg)
                IGT_TIMEOUT(end_time);
                int err;
 
+               /* Can't manually break the reset if i915 doesn't perform it */
+               if (intel_engine_uses_guc(engine))
+                       continue;
+
                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %d!\n", engine->name, err);
@@ -700,8 +714,12 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
+               bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);
 
+               if (using_guc && !active)
+                       continue;
+
                if (active && !intel_engine_can_store_dword(engine))
                        continue;
 
@@ -719,15 +737,24 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                count = 0;
                do {
-                       if (active) {
-                               struct i915_request *rq;
+                       struct i915_request *rq = NULL;
+                       struct intel_selftest_saved_policy saved;
+                       int err2;
+
+                       err = intel_selftest_modify_policy(engine, &saved,
+                                                          SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
+                       if (err) {
+                               pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+                               break;
+                       }
 
+                       if (active) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
-                                       break;
+                                       goto restore;
                                }
 
                                i915_request_get(rq);
@@ -743,34 +770,59 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
 
                                        i915_request_put(rq);
                                        err = -EIO;
-                                       break;
+                                       goto restore;
                                }
+                       }
 
-                               i915_request_put(rq);
+                       if (!using_guc) {
+                               err = intel_engine_reset(engine, NULL);
+                               if (err) {
+                                       pr_err("intel_engine_reset(%s) failed, err:%d\n",
+                                              engine->name, err);
+                                       goto skip;
+                               }
                        }
 
-                       err = intel_engine_reset(engine, NULL);
-                       if (err) {
-                               pr_err("intel_engine_reset(%s) failed, err:%d\n",
-                                      engine->name, err);
-                               break;
+                       if (rq) {
+                               /* Ensure the reset happens and kills the engine */
+                               err = intel_selftest_wait_for_rq(rq);
+                               if (err)
+                                       pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
+                                              engine->name, rq->fence.context,
+                                              rq->fence.seqno, rq->context->guc_id, err);
                        }
 
+skip:
+                       if (rq)
+                               i915_request_put(rq);
+
                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
-                               break;
+                               goto restore;
                        }
 
-                       if (i915_reset_engine_count(global, engine) !=
-                           ++reset_engine_count) {
-                               pr_err("%s engine reset not recorded!\n",
-                                      engine->name);
-                               err = -EINVAL;
-                               break;
+                       /* GuC based resets are not logged per engine */
+                       if (!using_guc) {
+                               if (i915_reset_engine_count(global, engine) !=
+                                   ++reset_engine_count) {
+                                       pr_err("%s engine reset not recorded!\n",
+                                              engine->name);
+                                       err = -EINVAL;
+                                       goto restore;
+                               }
                        }
 
                        count++;
+
+restore:
+                       err2 = intel_selftest_restore_policy(engine, &saved);
+                       if (err2)
+                               pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
+                       if (err == 0)
+                               err = err2;
+                       if (err)
+                               break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
@@ -943,10 +995,13 @@ static int __igt_reset_engines(struct intel_gt *gt,
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
+               bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);
 
-               if (flags & TEST_ACTIVE &&
-                   !intel_engine_can_store_dword(engine))
+               if (flags & TEST_ACTIVE) {
+                       if (!intel_engine_can_store_dword(engine))
+                               continue;
+               } else if (using_guc)
                        continue;
 
                if (!wait_for_idle(engine)) {
@@ -986,10 +1041,19 @@ static int __igt_reset_engines(struct intel_gt *gt,
 
                yield(); /* start all threads before we begin */
 
-               st_engine_heartbeat_disable(engine);
+               st_engine_heartbeat_disable_no_pm(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        struct i915_request *rq = NULL;
+                       struct intel_selftest_saved_policy saved;
+                       int err2;
+
+                       err = intel_selftest_modify_policy(engine, &saved,
+                                                          SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
+                       if (err) {
+                               pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+                               break;
+                       }
 
                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
@@ -997,7 +1061,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
-                                       break;
+                                       goto restore;
                                }
 
                                i915_request_get(rq);
@@ -1013,15 +1077,28 @@ static int __igt_reset_engines(struct intel_gt *gt,
 
                                        i915_request_put(rq);
                                        err = -EIO;
-                                       break;
+                                       goto restore;
                                }
+                       } else {
+                               intel_engine_pm_get(engine);
                        }
 
-                       err = intel_engine_reset(engine, NULL);
-                       if (err) {
-                               pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
-                                      engine->name, test_name, err);
-                               break;
+                       if (!using_guc) {
+                               err = intel_engine_reset(engine, NULL);
+                               if (err) {
+                                       pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
+                                              engine->name, test_name, err);
+                                       goto restore;
+                               }
+                       }
+
+                       if (rq) {
+                               /* Ensure the reset happens and kills the engine */
+                               err = intel_selftest_wait_for_rq(rq);
+                               if (err)
+                                       pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
+                                              engine->name, rq->fence.context,
+                                              rq->fence.seqno, rq->context->guc_id, err);
                        }
 
                        count++;
@@ -1037,7 +1114,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
-                                       break;
+                                       goto restore;
                                }
 
                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
@@ -1056,12 +1133,15 @@ static int __igt_reset_engines(struct intel_gt *gt,
                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
-                                       break;
+                                       goto restore;
                                }
 
                                i915_request_put(rq);
                        }
 
+                       if (!(flags & TEST_ACTIVE))
+                               intel_engine_pm_put(engine);
+
                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);
@@ -1073,22 +1153,34 @@ static int __igt_reset_engines(struct intel_gt *gt,
                                                  "%s\n", engine->name);
 
                                err = -EIO;
-                               break;
+                               goto restore;
                        }
+
+restore:
+                       err2 = intel_selftest_restore_policy(engine, &saved);
+                       if (err2)
+                               pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
+                       if (err == 0)
+                               err = err2;
+                       if (err)
+                               break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
-               st_engine_heartbeat_enable(engine);
+               st_engine_heartbeat_enable_no_pm(engine);
 
                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);
 
-               reported = i915_reset_engine_count(global, engine);
-               reported -= threads[engine->id].resets;
-               if (reported != count) {
-                       pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
-                              engine->name, test_name, count, reported);
-                       if (!err)
-                               err = -EINVAL;
+               /* GuC based resets are not logged per engine */
+               if (!using_guc) {
+                       reported = i915_reset_engine_count(global, engine);
+                       reported -= threads[engine->id].resets;
+                       if (reported != count) {
+                               pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+                                      engine->name, test_name, count, reported);
+                               if (!err)
+                                       err = -EINVAL;
+                       }
                }
 
 unwind:
@@ -1107,15 +1199,18 @@ unwind:
                        }
                        put_task_struct(threads[tmp].task);
 
-                       if (other->uabi_class != engine->uabi_class &&
-                           threads[tmp].resets !=
-                           i915_reset_engine_count(global, other)) {
-                               pr_err("Innocent engine %s was reset (count=%ld)\n",
-                                      other->name,
-                                      i915_reset_engine_count(global, other) -
-                                      threads[tmp].resets);
-                               if (!err)
-                                       err = -EINVAL;
+                       /* GuC based resets are not logged per engine */
+                       if (!using_guc) {
+                               if (other->uabi_class != engine->uabi_class &&
+                                   threads[tmp].resets !=
+                                   i915_reset_engine_count(global, other)) {
+                                       pr_err("Innocent engine %s was reset (count=%ld)\n",
+                                              other->name,
+                                              i915_reset_engine_count(global, other) -
+                                              threads[tmp].resets);
+                                       if (!err)
+                                               err = -EINVAL;
+                               }
                        }
                }
 
@@ -1555,18 +1650,29 @@ static int igt_reset_queue(void *arg)
                goto unlock;
 
        for_each_engine(engine, gt, id) {
+               struct intel_selftest_saved_policy saved;
                struct i915_request *prev;
                IGT_TIMEOUT(end_time);
                unsigned int count;
+               bool using_guc = intel_engine_uses_guc(engine);
 
                if (!intel_engine_can_store_dword(engine))
                        continue;
 
+               if (using_guc) {
+                       err = intel_selftest_modify_policy(engine, &saved,
+                                                          SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
+                       if (err) {
+                               pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
+                               goto fini;
+                       }
+               }
+
                prev = hang_create_request(&h, engine);
                if (IS_ERR(prev)) {
                        err = PTR_ERR(prev);
                        pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
-                       goto fini;
+                       goto restore;
                }
 
                i915_request_get(prev);
@@ -1581,7 +1687,7 @@ static int igt_reset_queue(void *arg)
                        if (IS_ERR(rq)) {
                                err = PTR_ERR(rq);
                                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
-                               goto fini;
+                               goto restore;
                        }
 
                        i915_request_get(rq);
@@ -1606,7 +1712,7 @@ static int igt_reset_queue(void *arg)
 
                                GEM_TRACE_DUMP();
                                intel_gt_set_wedged(gt);
-                               goto fini;
+                               goto restore;
                        }
 
                        if (!wait_until_running(&h, prev)) {
@@ -1624,7 +1730,7 @@ static int igt_reset_queue(void *arg)
                                intel_gt_set_wedged(gt);
 
                                err = -EIO;
-                               goto fini;
+                               goto restore;
                        }
 
                        reset_count = fake_hangcheck(gt, BIT(id));
@@ -1635,7 +1741,7 @@ static int igt_reset_queue(void *arg)
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
-                               goto fini;
+                               goto restore;
                        }
 
                        if (rq->fence.error) {
@@ -1644,7 +1750,7 @@ static int igt_reset_queue(void *arg)
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
-                               goto fini;
+                               goto restore;
                        }
 
                        if (i915_reset_count(global) == reset_count) {
@@ -1652,7 +1758,7 @@ static int igt_reset_queue(void *arg)
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
-                               goto fini;
+                               goto restore;
                        }
 
                        i915_request_put(prev);
@@ -1667,6 +1773,19 @@ static int igt_reset_queue(void *arg)
 
                i915_request_put(prev);
 
+restore:
+               if (using_guc) {
+                       int err2 = intel_selftest_restore_policy(engine, &saved);
+
+                       if (err2)
+                               pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
+                                      __func__, __LINE__, engine->name, err2);
+                       if (err == 0)
+                               err = err2;
+               }
+               if (err)
+                       goto fini;
+
                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
index b731473..13d25bf 100644 (file)
@@ -408,7 +408,8 @@ static int live_mocs_reset(void *arg)
                struct intel_context *ce;
                int err2;
 
-               err = intel_selftest_modify_policy(engine, &saved);
+               err = intel_selftest_modify_policy(engine, &saved,
+                                                  SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                if (err)
                        break;
 
index ba7ee69..e623ac4 100644 (file)
@@ -810,7 +810,8 @@ static int live_reset_whitelist(void *arg)
                                struct intel_selftest_saved_policy saved;
                                int err2;
 
-                               err = intel_selftest_modify_policy(engine, &saved);
+                               err = intel_selftest_modify_policy(engine, &saved,
+                                                                  SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                                if (err)
                                        goto out;
 
@@ -1278,7 +1279,8 @@ live_engine_reset_workarounds(void *arg)
                int ret2;
 
                pr_info("Verifying after %s reset...\n", engine->name);
-               ret = intel_selftest_modify_policy(engine, &saved);
+               ret = intel_selftest_modify_policy(engine, &saved,
+                                                  SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                if (ret)
                        break;
 
index 5cdee13..dac275e 100644 (file)
@@ -15,7 +15,8 @@
 #define WAIT_FOR_RESET_TIME    1000
 
 int intel_selftest_modify_policy(struct intel_engine_cs *engine,
-                                struct intel_selftest_saved_policy *saved)
+                                struct intel_selftest_saved_policy *saved,
+                                enum selftest_scheduler_modify modify_type)
 
 {
        int err;
@@ -25,18 +26,30 @@ int intel_selftest_modify_policy(struct intel_engine_cs *engine,
        saved->timeslice = engine->props.timeslice_duration_ms;
        saved->preempt_timeout = engine->props.preempt_timeout_ms;
 
-       /*
-        * Enable force pre-emption on time slice expiration
-        * together with engine reset on pre-emption timeout.
-        * This is required to make the GuC notice and reset
-        * the single hanging context.
-        * Also, reduce the preemption timeout to something
-        * small to speed the test up.
-        */
-       engine->i915->params.reset = 2;
-       engine->flags |= I915_ENGINE_WANT_FORCED_PREEMPTION;
-       engine->props.timeslice_duration_ms = REDUCED_TIMESLICE;
-       engine->props.preempt_timeout_ms = REDUCED_PREEMPT;
+       switch (modify_type) {
+       case SELFTEST_SCHEDULER_MODIFY_FAST_RESET:
+               /*
+                * Enable force pre-emption on time slice expiration
+                * together with engine reset on pre-emption timeout.
+                * This is required to make the GuC notice and reset
+                * the single hanging context.
+                * Also, reduce the preemption timeout to something
+                * small to speed the test up.
+                */
+               engine->i915->params.reset = 2;
+               engine->flags |= I915_ENGINE_WANT_FORCED_PREEMPTION;
+               engine->props.timeslice_duration_ms = REDUCED_TIMESLICE;
+               engine->props.preempt_timeout_ms = REDUCED_PREEMPT;
+               break;
+
+       case SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK:
+               engine->props.preempt_timeout_ms = 0;
+               break;
+
+       default:
+               pr_err("Invalid scheduler policy modification type: %d!\n", modify_type);
+               return -EINVAL;
+       }
 
        if (!intel_engine_uses_guc(engine))
                return 0;
index 79605b1..35c0986 100644 (file)
@@ -18,8 +18,14 @@ struct intel_selftest_saved_policy {
        u64 preempt_timeout;
 };
 
+enum selftest_scheduler_modify { /* modify_type for intel_selftest_modify_policy() */
+       SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK = 0, /* sets preempt_timeout_ms to 0 */
+       SELFTEST_SCHEDULER_MODIFY_FAST_RESET, /* forced preemption, reduced timeouts */
+};
+
 int intel_selftest_modify_policy(struct intel_engine_cs *engine,
-                                struct intel_selftest_saved_policy *saved);
+                                struct intel_selftest_saved_policy *saved,
+                                enum selftest_scheduler_modify modify_type);
 int intel_selftest_restore_policy(struct intel_engine_cs *engine,
                                  struct intel_selftest_saved_policy *saved);
 int intel_selftest_wait_for_rq(struct i915_request *rq);