OSDN Git Service

scsi: smartpqi: Capture controller reason codes
authorMurthy Bhat <Murthy.Bhat@microchip.com>
Tue, 28 Sep 2021 23:54:34 +0000 (18:54 -0500)
committerMartin K. Petersen <martin.petersen@oracle.com>
Tue, 5 Oct 2021 04:13:14 +0000 (00:13 -0400)
In some rare cases, the driver can halt the controller. Add a reason code
describing why the controller was halted.  Store this reason code in a
controller register to aid in debugging the issue.

Link: https://lore.kernel.org/r/20210928235442.201875-4-don.brace@microchip.com
Reviewed-by: Scott Benesh <scott.benesh@microchip.com>
Reviewed-by: Scott Teel <scott.teel@microchip.com>
Reviewed-by: Mike McGowen <mike.mcgowen@microchip.com>
Acked-by: John Donnelly <john.p.donnelly@oracle.com>
Signed-off-by: Murthy Bhat <Murthy.Bhat@microchip.com>
Signed-off-by: Don Brace <don.brace@microchip.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
drivers/scsi/smartpqi/smartpqi.h
drivers/scsi/smartpqi/smartpqi_init.c
drivers/scsi/smartpqi/smartpqi_sis.c
drivers/scsi/smartpqi/smartpqi_sis.h

index 70eca20..d66863f 100644 (file)
@@ -82,9 +82,11 @@ struct pqi_ctrl_registers {
        __le32  sis_product_identifier;                 /* B4h */
        u8      reserved5[0xbc - (0xb4 + sizeof(__le32))];
        __le32  sis_firmware_status;                    /* BCh */
-       u8      reserved6[0x1000 - (0xbc + sizeof(__le32))];
+       u8      reserved6[0xcc - (0xbc + sizeof(__le32))];
+       __le32  sis_ctrl_shutdown_reason_code;          /* CCh */
+       u8      reserved7[0x1000 - (0xcc + sizeof(__le32))];
        __le32  sis_mailbox[8];                         /* 1000h */
-       u8      reserved7[0x4000 - (0x1000 + (sizeof(__le32) * 8))];
+       u8      reserved8[0x4000 - (0x1000 + (sizeof(__le32) * 8))];
        /*
         * The PQI spec states that the PQI registers should be at
         * offset 0 from the PCIe BAR 0.  However, we can't map
@@ -102,6 +104,21 @@ struct pqi_ctrl_registers {
 
 #define PQI_DEVICE_REGISTERS_OFFSET    0x4000
 
+/* shutdown reasons for taking the controller offline */
+enum pqi_ctrl_shutdown_reason {
+       PQI_IQ_NOT_DRAINED_TIMEOUT = 1,
+       PQI_LUN_RESET_TIMEOUT = 2,
+       PQI_IO_PENDING_POST_LUN_RESET_TIMEOUT = 3,
+       PQI_NO_HEARTBEAT = 4,
+       PQI_FIRMWARE_KERNEL_NOT_UP = 5,
+       PQI_OFA_RESPONSE_TIMEOUT = 6,
+       PQI_INVALID_REQ_ID = 7,
+       PQI_UNMATCHED_REQ_ID = 8,
+       PQI_IO_PI_OUT_OF_RANGE = 9,
+       PQI_EVENT_PI_OUT_OF_RANGE = 10,
+       PQI_UNEXPECTED_IU_TYPE = 11
+};
+
 enum pqi_io_path {
        RAID_PATH = 0,
        AIO_PATH = 1
@@ -850,7 +867,8 @@ struct pqi_config_table_firmware_features {
 #define PQI_FIRMWARE_FEATURE_TMF_IU_TIMEOUT                    14
 #define PQI_FIRMWARE_FEATURE_RAID_BYPASS_ON_ENCRYPTED_NVME     15
 #define PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN    16
-#define PQI_FIRMWARE_FEATURE_MAXIMUM                           16
+#define PQI_FIRMWARE_FEATURE_FW_TRIAGE                         17
+#define PQI_FIRMWARE_FEATURE_MAXIMUM                           17
 
 struct pqi_config_table_debug {
        struct pqi_config_table_section_header header;
@@ -1297,6 +1315,7 @@ struct pqi_ctrl_info {
        u8              raid_iu_timeout_supported : 1;
        u8              tmf_iu_timeout_supported : 1;
        u8              unique_wwid_in_report_phys_lun_supported : 1;
+       u8              firmware_triage_supported : 1;
        u8              enable_r1_writes : 1;
        u8              enable_r5_writes : 1;
        u8              enable_r6_writes : 1;
index 5655d24..b6ac4d6 100644 (file)
@@ -54,7 +54,8 @@ MODULE_DESCRIPTION("Driver for Microchip Smart Family Controller version "
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL");
 
-static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info);
+static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info,
+       enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason);
 static void pqi_ctrl_offline_worker(struct work_struct *work);
 static int pqi_scan_scsi_devices(struct pqi_ctrl_info *ctrl_info);
 static void pqi_scan_start(struct Scsi_Host *shost);
@@ -226,7 +227,7 @@ static inline void pqi_check_ctrl_health(struct pqi_ctrl_info *ctrl_info)
 {
        if (ctrl_info->controller_online)
                if (!sis_is_firmware_running(ctrl_info))
-                       pqi_take_ctrl_offline(ctrl_info);
+                       pqi_take_ctrl_offline(ctrl_info, PQI_FIRMWARE_KERNEL_NOT_UP);
 }
 
 static inline bool pqi_is_hba_lunid(u8 *scsi3addr)
@@ -3180,9 +3181,10 @@ static int pqi_interpret_task_management_response(struct pqi_ctrl_info *ctrl_inf
        return rc;
 }
 
-static inline void pqi_invalid_response(struct pqi_ctrl_info *ctrl_info)
+static inline void pqi_invalid_response(struct pqi_ctrl_info *ctrl_info,
+       enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason)
 {
-       pqi_take_ctrl_offline(ctrl_info);
+       pqi_take_ctrl_offline(ctrl_info, ctrl_shutdown_reason);
 }
 
 static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue_group *queue_group)
@@ -3200,7 +3202,7 @@ static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue
        while (1) {
                oq_pi = readl(queue_group->oq_pi);
                if (oq_pi >= ctrl_info->num_elements_per_oq) {
-                       pqi_invalid_response(ctrl_info);
+                       pqi_invalid_response(ctrl_info, PQI_IO_PI_OUT_OF_RANGE);
                        dev_err(&ctrl_info->pci_dev->dev,
                                "I/O interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n",
                                oq_pi, ctrl_info->num_elements_per_oq - 1, oq_ci);
@@ -3215,7 +3217,7 @@ static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue
 
                request_id = get_unaligned_le16(&response->request_id);
                if (request_id >= ctrl_info->max_io_slots) {
-                       pqi_invalid_response(ctrl_info);
+                       pqi_invalid_response(ctrl_info, PQI_INVALID_REQ_ID);
                        dev_err(&ctrl_info->pci_dev->dev,
                                "request ID in response (%u) out of range (0-%u): producer index: %u  consumer index: %u\n",
                                request_id, ctrl_info->max_io_slots - 1, oq_pi, oq_ci);
@@ -3224,7 +3226,7 @@ static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue
 
                io_request = &ctrl_info->io_request_pool[request_id];
                if (atomic_read(&io_request->refcount) == 0) {
-                       pqi_invalid_response(ctrl_info);
+                       pqi_invalid_response(ctrl_info, PQI_UNMATCHED_REQ_ID);
                        dev_err(&ctrl_info->pci_dev->dev,
                                "request ID in response (%u) does not match an outstanding I/O request: producer index: %u  consumer index: %u\n",
                                request_id, oq_pi, oq_ci);
@@ -3260,7 +3262,7 @@ static int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, struct pqi_queue
                        pqi_process_io_error(response->header.iu_type, io_request);
                        break;
                default:
-                       pqi_invalid_response(ctrl_info);
+                       pqi_invalid_response(ctrl_info, PQI_UNEXPECTED_IU_TYPE);
                        dev_err(&ctrl_info->pci_dev->dev,
                                "unexpected IU type: 0x%x: producer index: %u  consumer index: %u\n",
                                response->header.iu_type, oq_pi, oq_ci);
@@ -3442,7 +3444,7 @@ static void pqi_process_soft_reset(struct pqi_ctrl_info *ctrl_info)
                pqi_ofa_free_host_buffer(ctrl_info);
                pqi_ctrl_ofa_done(ctrl_info);
                pqi_ofa_ctrl_unquiesce(ctrl_info);
-               pqi_take_ctrl_offline(ctrl_info);
+               pqi_take_ctrl_offline(ctrl_info, PQI_OFA_RESPONSE_TIMEOUT);
                break;
        }
 }
@@ -3567,7 +3569,7 @@ static void pqi_heartbeat_timer_handler(struct timer_list *t)
                        dev_err(&ctrl_info->pci_dev->dev,
                                "no heartbeat detected - last heartbeat count: %u\n",
                                heartbeat_count);
-                       pqi_take_ctrl_offline(ctrl_info);
+                       pqi_take_ctrl_offline(ctrl_info, PQI_NO_HEARTBEAT);
                        return;
                }
        } else {
@@ -3631,7 +3633,7 @@ static int pqi_process_event_intr(struct pqi_ctrl_info *ctrl_info)
        while (1) {
                oq_pi = readl(event_queue->oq_pi);
                if (oq_pi >= PQI_NUM_EVENT_QUEUE_ELEMENTS) {
-                       pqi_invalid_response(ctrl_info);
+                       pqi_invalid_response(ctrl_info, PQI_EVENT_PI_OUT_OF_RANGE);
                        dev_err(&ctrl_info->pci_dev->dev,
                                "event interrupt: producer index (%u) out of range (0-%u): consumer index: %u\n",
                                oq_pi, PQI_NUM_EVENT_QUEUE_ELEMENTS - 1, oq_ci);
@@ -7323,7 +7325,10 @@ static void pqi_ctrl_update_feature_flags(struct pqi_ctrl_info *ctrl_info,
                ctrl_info->unique_wwid_in_report_phys_lun_supported =
                        firmware_feature->enabled;
                break;
+       case PQI_FIRMWARE_FEATURE_FW_TRIAGE:
+               ctrl_info->firmware_triage_supported = firmware_feature->enabled;
                pqi_save_fw_triage_setting(ctrl_info, firmware_feature->enabled);
+               break;
        }
 
        pqi_firmware_feature_status(ctrl_info, firmware_feature);
@@ -7419,6 +7424,11 @@ static struct pqi_firmware_feature pqi_firmware_features[] = {
                .feature_bit = PQI_FIRMWARE_FEATURE_UNIQUE_WWID_IN_REPORT_PHYS_LUN,
                .feature_status = pqi_ctrl_update_feature_flags,
        },
+       {
+               .feature_name = "Firmware Triage",
+               .feature_bit = PQI_FIRMWARE_FEATURE_FW_TRIAGE,
+               .feature_status = pqi_ctrl_update_feature_flags,
+       },
 };
 
 static void pqi_process_firmware_features(
@@ -7519,6 +7529,7 @@ static void pqi_ctrl_reset_config(struct pqi_ctrl_info *ctrl_info)
        ctrl_info->raid_iu_timeout_supported = false;
        ctrl_info->tmf_iu_timeout_supported = false;
        ctrl_info->unique_wwid_in_report_phys_lun_supported = false;
+       ctrl_info->firmware_triage_supported = false;
 }
 
 static int pqi_process_config_table(struct pqi_ctrl_info *ctrl_info)
@@ -8459,7 +8470,8 @@ static void pqi_ctrl_offline_worker(struct work_struct *work)
        pqi_take_ctrl_offline_deferred(ctrl_info);
 }
 
-static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info)
+static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info,
+       enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason)
 {
        if (!ctrl_info->controller_online)
                return;
@@ -8468,7 +8480,7 @@ static void pqi_take_ctrl_offline(struct pqi_ctrl_info *ctrl_info)
        ctrl_info->pqi_mode_enabled = false;
        pqi_ctrl_block_requests(ctrl_info);
        if (!pqi_disable_ctrl_shutdown)
-               sis_shutdown_ctrl(ctrl_info);
+               sis_shutdown_ctrl(ctrl_info, ctrl_shutdown_reason);
        pci_disable_device(ctrl_info->pci_dev);
        dev_err(&ctrl_info->pci_dev->dev, "controller offline\n");
        schedule_work(&ctrl_info->ctrl_offline_work);
@@ -9304,6 +9316,8 @@ static void __attribute__((unused)) verify_structures(void)
        BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers,
                sis_firmware_status) != 0xbc);
        BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers,
+               sis_ctrl_shutdown_reason_code) != 0xcc);
+       BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers,
                sis_mailbox) != 0x1000);
        BUILD_BUG_ON(offsetof(struct pqi_ctrl_registers,
                pqi_registers) != 0x4000);
index 8acd3a8..d66eb8e 100644 (file)
@@ -397,14 +397,17 @@ void sis_enable_intx(struct pqi_ctrl_info *ctrl_info)
        sis_set_doorbell_bit(ctrl_info, SIS_ENABLE_INTX);
 }
 
-void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info)
+void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info,
+       enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason)
 {
        if (readl(&ctrl_info->registers->sis_firmware_status) &
                SIS_CTRL_KERNEL_PANIC)
                return;
 
-       writel(SIS_TRIGGER_SHUTDOWN,
-               &ctrl_info->registers->sis_host_to_ctrl_doorbell);
+       if (ctrl_info->firmware_triage_supported)
+               writel(ctrl_shutdown_reason, &ctrl_info->registers->sis_ctrl_shutdown_reason_code);
+
+       writel(SIS_TRIGGER_SHUTDOWN, &ctrl_info->registers->sis_host_to_ctrl_doorbell);
 }
 
 int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info)
index c1db930..bd92ff4 100644 (file)
@@ -21,7 +21,8 @@ int sis_get_pqi_capabilities(struct pqi_ctrl_info *ctrl_info);
 int sis_init_base_struct_addr(struct pqi_ctrl_info *ctrl_info);
 void sis_enable_msix(struct pqi_ctrl_info *ctrl_info);
 void sis_enable_intx(struct pqi_ctrl_info *ctrl_info);
-void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info);
+void sis_shutdown_ctrl(struct pqi_ctrl_info *ctrl_info,
+       enum pqi_ctrl_shutdown_reason ctrl_shutdown_reason);
 int sis_pqi_reset_quiesce(struct pqi_ctrl_info *ctrl_info);
 int sis_reenable_sis_mode(struct pqi_ctrl_info *ctrl_info);
 void sis_write_driver_scratch(struct pqi_ctrl_info *ctrl_info, u32 value);