OSDN Git Service

habanalabs: disable CPU access on timeouts
author Oded Gabbay <oded.gabbay@gmail.com>
Thu, 28 Feb 2019 08:46:12 +0000 (10:46 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 28 Feb 2019 12:04:59 +0000 (13:04 +0100)
This patch provides a workaround for a bug in the F/W where the response
time for a request from KMD may take more than 100ms. This could cause the
queue between KMD and the F/W to get out of sync.

The workaround is to:
1. Increase the timeout of ALL requests to 1s.
2. In case a request isn't answered in time, mark the state as
"cpu_disabled" and prevent sending further requests from KMD to the F/W.
This will eventually lead to a heartbeat failure and hard reset of the
device.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/habanalabs/debugfs.c
drivers/misc/habanalabs/device.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/hwmon.c
drivers/misc/habanalabs/sysfs.c

index f472b57..1d2bbcf 100644 (file)
@@ -723,7 +723,7 @@ static ssize_t hl_device_read(struct file *f, char __user *buf,
                return 0;
 
        sprintf(tmp_buf,
-               "Valid values are: disable, enable, suspend, resume\n");
+               "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
        rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
                        strlen(tmp_buf) + 1);
 
@@ -751,9 +751,11 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
                hdev->asic_funcs->suspend(hdev);
        } else if (strncmp("resume", data, strlen("resume")) == 0) {
                hdev->asic_funcs->resume(hdev);
+       } else if (strncmp("cpu_timeout", data, strlen("cpu_timeout")) == 0) {
+               hdev->device_cpu_disabled = true;
        } else {
                dev_err(hdev->dev,
-                       "Valid values are: disable, enable, suspend, resume\n");
+                       "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
                count = -EINVAL;
        }
 
index 120d30a..de46aa6 100644 (file)
@@ -636,6 +636,8 @@ again:
        /* Finished tear-down, starting to re-initialize */
 
        if (hard_reset) {
+               hdev->device_cpu_disabled = false;
+
                /* Allocate the kernel context */
                hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
                                                GFP_KERNEL);
index 7c2edab..5780041 100644 (file)
@@ -3232,6 +3232,11 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
        if (hdev->disabled)
                goto out;
 
+       if (hdev->device_cpu_disabled) {
+               rc = -EIO;
+               goto out;
+       }
+
        rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
                        pkt_dma_addr);
        if (rc) {
@@ -3245,8 +3250,8 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
        hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);
 
        if (rc == -ETIMEDOUT) {
-               dev_err(hdev->dev,
-                       "Timeout while waiting for CPU packet fence\n");
+               dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
+               hdev->device_cpu_disabled = true;
                goto out;
        }
 
index 59b25c6..a7c95e9 100644 (file)
@@ -1079,6 +1079,7 @@ struct hl_device_reset_work {
  * @dram_default_page_mapping: is DRAM default page mapping enabled.
  * @init_done: is the initialization of the device done.
  * @mmu_enable: is MMU enabled.
+ * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -1146,6 +1147,7 @@ struct hl_device {
        u8                              dram_supports_virtual_memory;
        u8                              dram_default_page_mapping;
        u8                              init_done;
+       u8                              device_cpu_disabled;
 
        /* Parameters for bring-up */
        u8                              mmu_enable;
index 9c359a1..7eec21f 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/pci.h>
 #include <linux/hwmon.h>
 
-#define SENSORS_PKT_TIMEOUT            100000  /* 100ms */
+#define SENSORS_PKT_TIMEOUT            1000000 /* 1s */
 #define HWMON_NR_SENSOR_TYPES          (hwmon_pwm + 1)
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
index 6d80e7e..12c7821 100644 (file)
@@ -9,8 +9,8 @@
 
 #include <linux/pci.h>
 
-#define SET_CLK_PKT_TIMEOUT    200000  /* 200ms */
-#define SET_PWR_PKT_TIMEOUT    400000  /* 400ms */
+#define SET_CLK_PKT_TIMEOUT    1000000 /* 1s */
+#define SET_PWR_PKT_TIMEOUT    1000000 /* 1s */
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 {