OSDN Git Service

media: venus: Rework error fail recover logic
authorMauro Carvalho Chehab <mchehab+huawei@kernel.org>
Tue, 27 Apr 2021 08:39:47 +0000 (10:39 +0200)
committerMauro Carvalho Chehab <mchehab+huawei@kernel.org>
Mon, 10 May 2021 09:36:33 +0000 (11:36 +0200)
The Venus code has a sort of watchdog that attempts to recover
from IP errors, implemented as a delayed work job, which
calls venus_sys_error_handler().

Right now, it has several issues:

1. It assumes that PM runtime resume never fails

2. It internally runs two while() loops that also assume that
   PM runtime will never fail to go idle:

while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
msleep(10);

...

while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
usleep_range(1000, 1500);

3. It uses an OR to merge all return codes and then report to the user

4. If the hardware never recovers, it keeps running on every 10ms,
   flooding the syslog with 2 messages (so, up to 200 messages
   per second).

Rework the code, in order to prevent that, by:

1. check the return code from PM runtime resume;
2. don't let the while() loops run forever;
3. store the failed event;
4. use warn ratelimited when it fails to recover.

Fixes: af2c3834c8ca ("[media] media: venus: adding core part and helper functions")
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
drivers/media/platform/qcom/venus/core.c

index 54bac7e..91b1584 100644 (file)
@@ -78,22 +78,32 @@ static const struct hfi_core_ops venus_core_ops = {
        .event_notify = venus_event_notify,
 };
 
+#define RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS 10
+
 static void venus_sys_error_handler(struct work_struct *work)
 {
        struct venus_core *core =
                        container_of(work, struct venus_core, work.work);
-       int ret = 0;
-
-       pm_runtime_get_sync(core->dev);
+       int ret, i, max_attempts = RPM_WAIT_FOR_IDLE_MAX_ATTEMPTS;
+       const char *err_msg = "";
+       bool failed = false;
+
+       ret = pm_runtime_get_sync(core->dev);
+       if (ret < 0) {
+               err_msg = "resume runtime PM";
+               max_attempts = 0;
+               failed = true;
+       }
 
        hfi_core_deinit(core, true);
 
-       dev_warn(core->dev, "system error has occurred, starting recovery!\n");
-
        mutex_lock(&core->lock);
 
-       while (pm_runtime_active(core->dev_dec) || pm_runtime_active(core->dev_enc))
+       for (i = 0; i < max_attempts; i++) {
+               if (!pm_runtime_active(core->dev_dec) && !pm_runtime_active(core->dev_enc))
+                       break;
                msleep(10);
+       }
 
        venus_shutdown(core);
 
@@ -101,31 +111,55 @@ static void venus_sys_error_handler(struct work_struct *work)
 
        pm_runtime_put_sync(core->dev);
 
-       while (core->pmdomains[0] && pm_runtime_active(core->pmdomains[0]))
+       for (i = 0; i < max_attempts; i++) {
+               if (!core->pmdomains[0] || !pm_runtime_active(core->pmdomains[0]))
+                       break;
                usleep_range(1000, 1500);
+       }
 
        hfi_reinit(core);
 
-       pm_runtime_get_sync(core->dev);
+       ret = pm_runtime_get_sync(core->dev);
+       if (ret < 0) {
+               err_msg = "resume runtime PM";
+               failed = true;
+       }
 
-       ret |= venus_boot(core);
-       ret |= hfi_core_resume(core, true);
+       ret = venus_boot(core);
+       if (ret && !failed) {
+               err_msg = "boot Venus";
+               failed = true;
+       }
+
+       ret = hfi_core_resume(core, true);
+       if (ret && !failed) {
+               err_msg = "resume HFI";
+               failed = true;
+       }
 
        enable_irq(core->irq);
 
        mutex_unlock(&core->lock);
 
-       ret |= hfi_core_init(core);
+       ret = hfi_core_init(core);
+       if (ret && !failed) {
+               err_msg = "init HFI";
+               failed = true;
+       }
 
        pm_runtime_put_sync(core->dev);
 
-       if (ret) {
+       if (failed) {
                disable_irq_nosync(core->irq);
-               dev_warn(core->dev, "recovery failed (%d)\n", ret);
+               dev_warn_ratelimited(core->dev,
+                                    "System error has occurred, recovery failed to %s\n",
+                                    err_msg);
                schedule_delayed_work(&core->work, msecs_to_jiffies(10));
                return;
        }
 
+       dev_warn(core->dev, "system error has occurred (recovered)\n");
+
        mutex_lock(&core->lock);
        core->sys_error = false;
        mutex_unlock(&core->lock);