RDMA/hns: Recover 1bit-ECC error of RAM on chip
author     Haoyue Xu <xuhaoyue1@hisilicon.com>
           Thu, 14 Jul 2022 13:43:53 +0000 (21:43 +0800)
committer  Leon Romanovsky <leonro@nvidia.com>
           Mon, 18 Jul 2022 11:16:40 +0000 (14:16 +0300)

Since ECC memory maintains a memory system immune to single-bit errors,
add support for correcting 1bit-ECC errors of RAM on chip, preventing a
1bit-ECC error from growing into an uncorrectable error. A 1bit-ECC error
in the internal RAM of the ROCE engine, such as the QPC table, is raised
when the entry is read, and the ROCE engine corrects it by writing the
entry back.
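
In outline, the recovery path added in hns_roce_hw_v2.c behaves as below
(a condensed sketch using only helpers introduced by this patch; error
logging elided):

        /* Scheduled from the abnormal-IRQ handler when no basic error
         * bit is set (HIP09 and newer): query the 1bit-ECC status over
         * the cmdq and dispatch on the reported resource type.
         */
        struct fmea_ram_ecc ecc_info = {};

        if (fmea_ram_ecc_query(hr_dev, &ecc_info) || !ecc_info.is_ecc_err)
                return;

        if (ecc_info.res_type == ECC_RESOURCE_GMV)
                fmea_recover_gmv(hr_dev, ecc_info.index);
        else
                fmea_recover_others(hr_dev, ecc_info.res_type,
                                    ecc_info.index);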

Link: https://lore.kernel.org/r/20220714134353.16700-6-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.h

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 2855e9a..f848eed 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -959,6 +959,7 @@ struct hns_roce_dev {
        const struct hns_roce_hw *hw;
        void                    *priv;
        struct workqueue_struct *irq_workq;
+       struct work_struct ecc_work;
        const struct hns_roce_dfx_hw *dfx;
        u32 func_num;
        u32 is_vf;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 782f09a..cbdafaa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -55,6 +55,42 @@ enum {
        CMD_RST_PRC_EBUSY,
 };
 
+enum ecc_resource_type {
+       ECC_RESOURCE_QPC,
+       ECC_RESOURCE_CQC,
+       ECC_RESOURCE_MPT,
+       ECC_RESOURCE_SRQC,
+       ECC_RESOURCE_GMV,
+       ECC_RESOURCE_QPC_TIMER,
+       ECC_RESOURCE_CQC_TIMER,
+       ECC_RESOURCE_SCCC,
+       ECC_RESOURCE_COUNT,
+};
+
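+/* Per-resource mailbox opcodes for reading and rewriting the BT0 entry,
+ * indexed by enum ecc_resource_type (GMV is recovered through the cmdq
+ * instead, so it carries no mailbox opcodes).
+ */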
+static const struct {
+       const char *name;
+       u8 read_bt0_op;
+       u8 write_bt0_op;
+} fmea_ram_res[] = {
+       { "ECC_RESOURCE_QPC",
+         HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 },
+       { "ECC_RESOURCE_CQC",
+         HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 },
+       { "ECC_RESOURCE_MPT",
+         HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 },
+       { "ECC_RESOURCE_SRQC",
+         HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 },
+       /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
+       { "ECC_RESOURCE_GMV",
+         0, 0 },
+       { "ECC_RESOURCE_QPC_TIMER",
+         HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 },
+       { "ECC_RESOURCE_CQC_TIMER",
+         HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 },
+       { "ECC_RESOURCE_SCCC",
+         HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 },
+};
+
 static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
                                   struct ib_sge *sg)
 {
@@ -6017,6 +6053,142 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
        return IRQ_RETVAL(int_work);
 }
 
+static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
+                              struct fmea_ram_ecc *ecc_info)
+{
+       struct hns_roce_cmq_desc desc;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+       int ret;
+
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
+       ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+       if (ret)
+               return ret;
+
+       ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
+       ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
+       ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);
+
+       return 0;
+}
+
+static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
+{
+       struct hns_roce_cmq_desc desc;
+       struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+       u32 addr_upper;
+       u32 addr_low;
+       int ret;
+
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true);
+       hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+       ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+       if (ret) {
+               dev_err(hr_dev->dev,
+                       "failed to execute cmd to read gmv, ret = %d.\n", ret);
+               return ret;
+       }
+
+       addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L);
+       addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H);
+
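+       /* Write the base address just read back to the same index; the
+        * rewrite corrects the 1bit-ECC error in the GMV base-table entry.
+        */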
+       hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false);
+       hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low);
+       hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper);
+       hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+       return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
+static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
+{
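+       /* QPC_TIMER, CQC_TIMER and SCCC tables hold the BT0 base address
+        * directly; the other tables hold a page number that must be
+        * shifted into an address.
+        */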
+       if (res_type == ECC_RESOURCE_QPC_TIMER ||
+           res_type == ECC_RESOURCE_CQC_TIMER ||
+           res_type == ECC_RESOURCE_SCCC)
+               return le64_to_cpu(*data);
+
+       return le64_to_cpu(*data) << PAGE_SHIFT;
+}
+
+static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
+                              u32 index)
+{
+       u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op;
+       u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op;
+       struct hns_roce_cmd_mailbox *mailbox;
+       u64 addr;
+       int ret;
+
+       mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+       if (IS_ERR(mailbox))
+               return PTR_ERR(mailbox);
+
+       ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index);
+       if (ret) {
+               dev_err(hr_dev->dev,
+                       "failed to execute cmd to read fmea ram, ret = %d.\n",
+                       ret);
+               goto out;
+       }
+
+       addr = fmea_get_ram_res_addr(res_type, mailbox->buf);
+
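+       /* Write the BT0 data back through the mailbox to rewrite, and so
+        * correct, the faulty entry.
+        */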
+       ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index);
+       if (ret)
+               dev_err(hr_dev->dev,
+                       "failed to execute cmd to write fmea ram, ret = %d.\n",
+                       ret);
+
+out:
+       hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+       return ret;
+}
+
+static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
+                                struct fmea_ram_ecc *ecc_info)
+{
+       u32 res_type = ecc_info->res_type;
+       u32 index = ecc_info->index;
+       int ret;
+
+       BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);
+
+       if (res_type >= ECC_RESOURCE_COUNT) {
+               dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
+                       res_type);
+               return;
+       }
+
+       if (res_type == ECC_RESOURCE_GMV)
+               ret = fmea_recover_gmv(hr_dev, index);
+       else
+               ret = fmea_recover_others(hr_dev, res_type, index);
+       if (ret)
+               dev_err(hr_dev->dev,
+                       "failed to recover %s, index = %u, ret = %d.\n",
+                       fmea_ram_res[res_type].name, index, ret);
+}
+
+static void fmea_ram_ecc_work(struct work_struct *ecc_work)
+{
+       struct hns_roce_dev *hr_dev =
+               container_of(ecc_work, struct hns_roce_dev, ecc_work);
+       struct fmea_ram_ecc ecc_info = {};
+
+       if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
+               dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
+               return;
+       }
+
+       if (!ecc_info.is_ecc_err) {
+               dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
+               return;
+       }
+
+       fmea_ram_ecc_recover(hr_dev, &ecc_info);
+}
+
 static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
 {
        struct hns_roce_dev *hr_dev = dev_id;
@@ -6025,10 +6197,14 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
 
        int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
 
-       if (int_st)
+       if (int_st) {
                int_work = abnormal_interrupt_basic(hr_dev, int_st);
-       else
+       } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+               queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
+               int_work = IRQ_HANDLED;
+       } else {
                dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
+       }
 
        return IRQ_RETVAL(int_work);
 }
@@ -6344,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
                }
        }
 
+       INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
+
        hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
        if (!hr_dev->irq_workq) {
                dev_err(dev, "failed to create irq workqueue.\n");
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e618614..f96deba 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
        HNS_ROCE_OPC_CFG_GMV_TBL                        = 0x850f,
        HNS_ROCE_OPC_CFG_GMV_BT                         = 0x8510,
        HNS_ROCE_OPC_EXT_CFG                            = 0x8512,
+       HNS_ROCE_QUERY_RAM_ECC                          = 0x8513,
        HNS_SWITCH_PARAMETER_CFG                        = 0x1033,
 };
 
@@ -1107,6 +1108,11 @@ enum {
 #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
 #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
 
+/* Fields of HNS_ROCE_QUERY_RAM_ECC */
+#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
+#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
+#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)
+
 struct hns_roce_cfg_sgid_tb {
        __le32  table_idx_rsv;
        __le32  vf_sgid_l;
@@ -1343,6 +1349,12 @@ struct hns_roce_dip {
        struct list_head node; /* all dips are on a list */
 };
 
+struct fmea_ram_ecc {
+       u32     is_ecc_err;
+       u32     res_type;
+       u32     index;
+};
+
 /* only for RNR timeout issue of HIP08 */
 #define HNS_ROCE_CLOCK_ADJUST 1000
 #define HNS_ROCE_MAX_CQ_PERIOD 65