IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs
author    Kaike Wan <kaike.wan@intel.com>
          Thu, 24 Jan 2019 05:51:39 +0000 (21:51 -0800)
committer Doug Ledford <dledford@redhat.com>
          Tue, 5 Feb 2019 23:07:44 +0000 (18:07 -0500)
This patch integrates the TID RDMA WRITE protocol into the normal RDMA
verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol
between the hfi1 drivers on two OPA nodes that converts a qualified
RDMA WRITE request into a TID RDMA WRITE request to avoid data copying
on the responder side.
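
On the send side the conversion amounts to rewriting the opcode of a
posted work request before the RC send engine processes it. A minimal
sketch of that idea (the helper, the threshold variable, and the exact
qualification test are illustrative assumptions, not the driver's
actual entry point):

	/*
	 * Sketch only: assumes OPFN negotiation has already exchanged
	 * TID RDMA parameters with the remote node.
	 */
	static bool qualifies_for_tid_write(struct rvt_qp *qp,
					    struct rvt_swqe *wqe)
	{
		return qp->ibqp.qp_type == IB_QPT_RC &&
		       wqe->wr.opcode == IB_WR_RDMA_WRITE &&
		       wqe->length >= tid_min_write_len; /* hypothetical */
	}

	/* Requester-side rewrite before the WQE is queued: */
	if (qualifies_for_tid_write(qp, wqe))
		wqe->wr.opcode = IB_WR_TID_RDMA_WRITE;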

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
drivers/infiniband/hw/hfi1/qp.c
drivers/infiniband/hw/hfi1/rc.c
drivers/infiniband/hw/hfi1/tid_rdma.c
drivers/infiniband/hw/hfi1/user_sdma.c
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hfi1/verbs.h
include/rdma/rdmavt_qp.h

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 96632c7..cfd598e 100644
@@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
        .flags = RVT_OPERATION_USE_RESERVE,
 },
 
+[IB_WR_TID_RDMA_WRITE] = {
+       .length = sizeof(struct ib_rdma_wr),
+       .qpt_support = BIT(IB_QPT_RC),
+       .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
 };
 
 static void flush_list_head(struct list_head *l)
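
The table entry above is what lets rdmavt accept IB_WR_TID_RDMA_WRITE at
post time. A simplified sketch of the validation that consults it (the
real check lives in rdmavt's rvt_qp_valid_operation() and covers more
cases):

	static int check_post_parms(struct rvt_qp *qp,
				    const struct ib_send_wr *wr)
	{
		const struct rvt_operation_params *p =
			&hfi1_post_parms[wr->opcode];

		if (!p->length)         /* opcode not registered */
			return -EINVAL;
		if (!(p->qpt_support & BIT(qp->ibqp.qp_type)))
			return -EINVAL; /* TID RDMA WRITE is RC-only */
		return p->length;       /* bytes of the WR to copy */
	}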
@@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp)
        struct hfi1_qp_priv *priv = qp->priv;
 
        hfi1_del_tid_reap_timer(qp);
+       hfi1_del_tid_retry_timer(qp);
        iowait_sdma_drain(&priv->s_iowait);
        qp_pio_drain(qp);
        flush_tx_list(qp);
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index fcb733e..6d2abea 100644
@@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
                       struct hfi1_pkt_state *ps)
 {
        struct rvt_ack_entry *e;
-       u32 hwords;
+       u32 hwords, hdrlen;
        u32 len = 0;
        u32 bth0 = 0, bth2 = 0;
        u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
        int middle = 0;
        u32 pmtu = qp->pmtu;
-       struct hfi1_qp_priv *priv = qp->priv;
+       struct hfi1_qp_priv *qpriv = qp->priv;
        bool last_pkt;
        u32 delta;
        u8 next = qp->s_tail_ack_queue;
+       struct tid_rdma_request *req;
 
        trace_hfi1_rsp_make_rc_ack(qp, 0);
        lockdep_assert_held(&qp->s_lock);
@@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
                goto bail;
 
-       if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+       if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
                /* header size in 32-bit words LRH+BTH = (8+12)/4. */
                hwords = 5;
        else
@@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
                        hwords++;
                        qp->s_ack_rdma_psn = e->psn;
                        bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               } else if (e->opcode == TID_OP(WRITE_REQ)) {
+                       /*
+                        * If a TID RDMA WRITE RESP is being resent, we have to
+                        * wait for the actual request. All requests that are to
+                        * be resent will have their state set to
+                        * TID_REQUEST_RESEND. When the new request arrives, the
+                        * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+                        */
+                       req = ack_to_tid_req(e);
+                       if (req->state == TID_REQUEST_RESEND ||
+                           req->state == TID_REQUEST_INIT_RESEND)
+                               goto bail;
+                       qp->s_ack_state = TID_OP(WRITE_RESP);
+                       qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+                       goto write_resp;
                } else if (e->opcode == TID_OP(READ_REQ)) {
                        /*
                         * If a TID RDMA read response is being resent and
@@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
                bth2 = mask_psn(qp->s_ack_rdma_psn++);
                break;
 
+       case TID_OP(WRITE_RESP):
+write_resp:
+               /*
+                * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+                *    goto normal.
+                * 2. Attempt to allocate TID resources.
+                * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+                * 4. If resources not available:
+                *    4.1 Set RVT_S_WAIT_TID_SPACE
+                *    4.2 Queue QP on RCD TID queue
+                *    4.3 Put QP on iowait list.
+                *    4.4 Build IB RNR NAK with appropriate timeout value
+                *    4.5 Return an indication that progress was made.
+                * 5. If resources are available:
+                *    5.1 Program HW flow CSRs
+                *    5.2 Build TID RDMA WRITE RESP packet
+                *    5.3 If more resources are needed, repeat steps 2 - 3.
+                *    5.4 Wake up next QP on RCD TID queue.
+                *    5.5 Return an indication that progress was made.
+                */
+
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               req = ack_to_tid_req(e);
+
+               /*
+                * Send scheduled RNR NAKs. RNR NAKs need to be sent at
+                * segment boundaries, not at request boundaries. Don't change
+                * s_ack_state because we are still in the middle of a request.
+                */
+               if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+                   qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+                   req->cur_seg == req->alloc_seg) {
+                       qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+                       goto normal_no_state;
+               }
+
+               bth2 = mask_psn(qp->s_ack_rdma_psn);
+               hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+                                                       bth2, &len,
+                                                       &ps->s_txreq->ss);
+               if (!hdrlen)
+                       return 0;
+
+               hwords += hdrlen;
+               bth0 = qp->s_ack_state << 24;
+               qp->s_ack_rdma_psn++;
+               if (req->cur_seg != req->total_segs)
+                       break;
+
+               e->sent = 1;
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+               break;
+
        case TID_OP(READ_RESP):
 read_resp:
                e = &qp->s_ack_queue[qp->s_tail_ack_queue];
@@ -298,8 +367,7 @@ normal:
                 * (see above).
                 */
                qp->s_ack_state = OP(SEND_ONLY);
-               qp->s_flags &= ~RVT_S_ACK_PENDING;
-               ps->s_txreq->ss = NULL;
+normal_no_state:
                if (qp->s_nak_state)
                        ohdr->u.aeth =
                                cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -311,9 +379,11 @@ normal:
                len = 0;
                bth0 = OP(ACKNOWLEDGE) << 24;
                bth2 = mask_psn(qp->s_ack_psn);
+               qp->s_flags &= ~RVT_S_ACK_PENDING;
+               ps->s_txreq->ss = NULL;
        }
        qp->s_rdma_ack_cnt++;
-       ps->s_txreq->sde = priv->s_sde;
+       ps->s_txreq->sde = qpriv->s_sde;
        ps->s_txreq->s_cur_size = len;
        ps->s_txreq->hdr_dwords = hwords;
        hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
@@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
        int middle = 0;
        int delta;
        struct tid_rdma_flow *flow = NULL;
+       struct tid_rdma_params *remote;
 
        trace_hfi1_sender_make_rc_req(qp);
        lockdep_assert_held(&qp->s_lock);
@@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
                goto done_free_tx;
        }
 
-       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+       if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
                goto bail;
 
        if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -586,6 +657,108 @@ no_flow_control:
                                qp->s_cur = 0;
                        break;
 
+               case IB_WR_TID_RDMA_WRITE:
+                       if (newreq) {
+                               /*
+                                * Limit the number of TID RDMA WRITE requests.
+                                */
+                               if (atomic_read(&priv->n_tid_requests) >=
+                                   HFI1_TID_RDMA_WRITE_CNT)
+                                       goto bail;
+
+                               if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                       }
+
+                       hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+                                                               &bth1, &bth2,
+                                                               &len);
+                       ss = NULL;
+                       if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+                               priv->s_tid_cur = qp->s_cur;
+                               if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+                                       priv->s_tid_tail = qp->s_cur;
+                                       priv->s_state = TID_OP(WRITE_RESP);
+                               }
+                       } else if (priv->s_tid_cur == priv->s_tid_head) {
+                               struct rvt_swqe *__w;
+                               struct tid_rdma_request *__r;
+
+                               __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+                               __r = wqe_to_tid_req(__w);
+
+                               /*
+                                * The s_tid_cur pointer is advanced to s_cur if
+                                * any of the following conditions about the WQE
+                                * to which s_tid_cur currently points are
+                                * satisfied:
+                                *   1. The request is not a TID RDMA WRITE
+                                *      request,
+                                *   2. The request is in the INACTIVE or
+                                *      COMPLETE states (TID RDMA READ requests
+                                *      stay at INACTIVE and TID RDMA WRITE
+                                *      transition to COMPLETE when done),
+                                *   3. The request is in the ACTIVE or SYNC
+                                *      state and the number of completed
+                                *      segments is equal to the total segment
+                                *      count.
+                                *      (If ACTIVE, the request is waiting for
+                                *       ACKs. If SYNC, the request has not
+                                *       received any responses because it's
+                                *       waiting on a sync point.)
+                                */
+                               if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+                                   __r->state == TID_REQUEST_INACTIVE ||
+                                   __r->state == TID_REQUEST_COMPLETE ||
+                                   ((__r->state == TID_REQUEST_ACTIVE ||
+                                     __r->state == TID_REQUEST_SYNC) &&
+                                    __r->comp_seg == __r->total_segs)) {
+                                       if (priv->s_tid_tail ==
+                                           priv->s_tid_cur &&
+                                           priv->s_state ==
+                                           TID_OP(WRITE_DATA_LAST)) {
+                                               priv->s_tid_tail = qp->s_cur;
+                                               priv->s_state =
+                                                       TID_OP(WRITE_RESP);
+                                       }
+                                       priv->s_tid_cur = qp->s_cur;
+                               }
+                               /*
+                                * A corner case: when the last TID RDMA WRITE
+                                * request was completed, s_tid_head,
+                                * s_tid_cur, and s_tid_tail all point to the
+                                * same location. Other requests are posted and
+                                * s_cur wraps around to the same location,
+                                * where a new TID RDMA WRITE is posted. In
+                                * this case, none of the indices need to be
+                                * updated. However, priv->s_state still needs
+                                * to be updated.
+                                */
+                               if (priv->s_tid_tail == qp->s_cur &&
+                                   priv->s_state == TID_OP(WRITE_DATA_LAST))
+                                       priv->s_state = TID_OP(WRITE_RESP);
+                       }
+                       req = wqe_to_tid_req(wqe);
+                       if (newreq) {
+                               priv->s_tid_head = qp->s_cur;
+                               priv->pending_tid_w_resp += req->total_segs;
+                               atomic_inc(&priv->n_tid_requests);
+                               atomic_dec(&priv->n_requests);
+                       } else {
+                               req->state = TID_REQUEST_RESEND;
+                               req->comp_seg = delta_psn(bth2, wqe->psn);
+                               /*
+                                * Pull back any segments since we are going
+                                * to re-receive them.
+                                */
+                               req->setup_head = req->clear_tail;
+                               priv->pending_tid_w_resp +=
+                                       delta_psn(wqe->lpsn, bth2) + 1;
+                       }
+
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
                case IB_WR_RDMA_READ:
                        /*
                         * Don't allow more operations to be started
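
A TID RDMA WRITE request occupies one PSN per segment, which is what
makes the resend bookkeeping above work out. A worked example with
hypothetical numbers:

	/* A request with wqe->psn = 100 and req->total_segs = 4 spans
	 * PSNs 100..103 (wqe->lpsn = 103). Restarting at bth2 = 102: */
	req->comp_seg = delta_psn(102, 100);                  /* 2 done */
	priv->pending_tid_w_resp += delta_psn(103, 102) + 1;  /* 2 due  */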
@@ -745,7 +918,8 @@ no_flow_control:
                        if (qp->s_tail >= qp->s_size)
                                qp->s_tail = 0;
                }
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+               if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                   wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
                        qp->s_psn = wqe->lpsn + 1;
                else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
                        qp->s_psn = req->s_next_psn;
@@ -865,6 +1039,33 @@ no_flow_control:
                if (qp->s_cur == qp->s_size)
                        qp->s_cur = 0;
                break;
+
+       case TID_OP(WRITE_RESP):
+               /*
+                * This value for s_state is used for restarting a TID RDMA
+                * WRITE request. See the comment under
+                * OP(RDMA_READ_RESPONSE_MIDDLE) for more.
+                */
+               req = wqe_to_tid_req(wqe);
+               req->state = TID_REQUEST_RESEND;
+               rcu_read_lock();
+               remote = rcu_dereference(priv->tid_rdma.remote);
+               req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+               len = wqe->length - (req->comp_seg * remote->max_len);
+               rcu_read_unlock();
+
+               bth2 = mask_psn(qp->s_psn);
+               hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+                                                       &bth2, &len);
+               qp->s_psn = wqe->lpsn + 1;
+               ss = NULL;
+               qp->s_state = TID_OP(WRITE_REQ);
+               priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+               priv->s_tid_cur = qp->s_cur;
+               if (++qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
        case TID_OP(READ_RESP):
                if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
                        goto bail;
@@ -965,7 +1166,8 @@ no_flow_control:
        }
        qp->s_sending_hpsn = bth2;
        delta = delta_psn(bth2, wqe->psn);
-       if (delta && delta % HFI1_PSN_CREDIT == 0)
+       if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+           wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
                bth2 |= IB_BTH_REQ_ACK;
        if (qp->s_flags & RVT_S_SEND_ONE) {
                qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -998,6 +1200,12 @@ bail:
 bail_no_tx:
        ps->s_txreq = NULL;
        qp->s_flags &= ~RVT_S_BUSY;
+       /*
+        * If we didn't get a txreq, the QP will be woken up later to try
+        * again. Set the flags to indicate which work item to wake
+        * up.
+        */
+       iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
        return 0;
 }
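
IOWAIT_PENDING_IB is one of the per-QP iowait work-item bits. When the
QP is later woken, the wakeup path can test the pending bits to pick
the right engine; roughly (a simplified sketch, assuming the TID send
engine added elsewhere in this series):

	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB))
		hfi1_schedule_send(qp);     /* normal verbs send engine */
	if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID))
		hfi1_schedule_tid_send(qp); /* TID RDMA send engine */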
 
@@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
        lockdep_assert_held(&qp->s_lock);
        qp->s_cur = n;
        priv->pending_tid_r_segs = 0;
+       priv->pending_tid_w_resp = 0;
        qp->s_num_rd_atomic = 0;
 
        /*
@@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
                qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
                break;
 
+       case IB_WR_TID_RDMA_WRITE:
+               qp->s_state = TID_OP(WRITE_RESP);
+               break;
+
        case IB_WR_RDMA_READ:
                qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
                break;
@@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 
        qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
                         RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
-                        RVT_S_WAIT_ACK);
+                        RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
        if (wait)
                qp->s_flags |= RVT_S_SEND_ONE;
        reset_psn(qp, psn);
@@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
 
 /*
  * Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
  */
 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 {
@@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
                wqe = rvt_get_swqe_ptr(qp, n);
                if (cmp_psn(psn, wqe->lpsn) <= 0) {
                        if (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                           wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+                           wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+                           wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
                                qp->s_sending_psn = wqe->lpsn + 1;
                        else
                                qp->s_sending_psn = psn + 1;
@@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
        struct rvt_swqe *wqe;
        struct ib_header *hdr = NULL;
        struct hfi1_16b_header *hdr_16b = NULL;
-       u32 opcode;
+       u32 opcode, head, tail;
        u32 psn;
+       struct tid_rdma_request *req;
 
        lockdep_assert_held(&qp->s_lock);
        if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
        opcode = ib_bth_get_opcode(ohdr);
        if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
             opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
-           opcode == TID_OP(READ_RESP)) {
+           opcode == TID_OP(READ_RESP) ||
+           opcode == TID_OP(WRITE_RESP)) {
                WARN_ON(!qp->s_rdma_ack_cnt);
                qp->s_rdma_ack_cnt--;
                return;
        }
 
        psn = ib_bth_get_psn(ohdr);
-       reset_sending_psn(qp, psn);
+       /*
+        * Don't attempt to reset the sending PSN for packets in the
+        * KDETH PSN space since the PSN does not match anything.
+        */
+       if (opcode != TID_OP(WRITE_DATA) &&
+           opcode != TID_OP(WRITE_DATA_LAST) &&
+           opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+               reset_sending_psn(qp, psn);
+
+       /* Handle TID RDMA WRITE packets differently */
+       if (opcode >= TID_OP(WRITE_REQ) &&
+           opcode <= TID_OP(WRITE_DATA_LAST)) {
+               head = priv->s_tid_head;
+               tail = priv->s_tid_cur;
+               /*
+                * s_tid_cur is set to s_tid_head in the case where
+                * a new TID RDMA request is being started and all
+                * previous ones have been completed.
+                * Therefore, we need to do a secondary check in order
+                * to properly determine whether we should start the
+                * RC timer.
+                */
+               wqe = rvt_get_swqe_ptr(qp, tail);
+               req = wqe_to_tid_req(wqe);
+               if (head == tail && req->comp_seg < req->total_segs) {
+                       if (tail == 0)
+                               tail = qp->s_size - 1;
+                       else
+                               tail -= 1;
+               }
+       } else {
+               head = qp->s_tail;
+               tail = qp->s_acked;
+       }
 
        /*
         * Start timer after a packet requesting an ACK has been sent and
         * there are still requests that haven't been acked.
         */
-       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+       if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+           opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+           opcode != TID_OP(RESYNC) &&
            !(qp->s_flags &
-               (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
-               (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+             (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+           (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
                if (opcode == TID_OP(READ_REQ))
                        rvt_add_retry_timer_ext(qp, priv->timeout_shift);
                else
                        rvt_add_retry_timer(qp);
        }
 
+       /* Start TID RDMA ACK timer */
+       if ((opcode == TID_OP(WRITE_DATA) ||
+            opcode == TID_OP(WRITE_DATA_LAST) ||
+            opcode == TID_OP(RESYNC)) &&
+           (psn & IB_BTH_REQ_ACK) &&
+           !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+           (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+               /*
+                * The TID RDMA ACK packet could be received before this
+                * function is called. Therefore, add the timer only if TID
+                * RDMA ACK packets are actually pending.
+                */
+               wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+               req = wqe_to_tid_req(wqe);
+               if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+                   req->ack_seg < req->cur_seg)
+                       hfi1_add_tid_retry_timer(qp);
+       }
+
        while (qp->s_last != qp->s_acked) {
                u32 s_last;
 
@@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
        }
 
        qp->s_retry = qp->s_retry_cnt;
-       update_last_psn(qp, wqe->lpsn);
+       /*
+        * Don't update the last PSN if the request being completed is
+        * a TID RDMA WRITE request.
+        * Completion of TID RDMA WRITE requests is done by the
+        * TID RDMA ACKs and as such could be for a request that has
+        * already been ACKed as far as the IB state machine is
+        * concerned.
+        */
+       if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+               update_last_psn(qp, wqe->lpsn);
 
        /*
         * If we are completing a request which is in the process of
@@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
        return wqe;
 }
 
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+       /* Retry this request. */
+       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+               qp->r_flags |= RVT_R_RDMAR_SEQ;
+               hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+               if (list_empty(&qp->rspwait)) {
+                       qp->r_flags |= RVT_R_RSP_SEND;
+                       rvt_get_qp(qp);
+                       list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+               }
+       }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn:  The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+                                 u32 lpsn)
+{
+       struct hfi1_qp_priv *qpriv = qp->priv;
+
+       qp->s_psn = psn + 1;
+       /*
+        * If this is the first TID RDMA WRITE RESP packet for the current
+        * request, change the s_state so that the retry will be processed
+        * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+        * packet, change the s_state and advance the s_cur.
+        */
+       if (cmp_psn(psn, lpsn) >= 0) {
+               qp->s_cur = qpriv->s_tid_cur + 1;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               qp->s_state = TID_OP(WRITE_REQ);
+       } else  if (!cmp_psn(psn, spsn)) {
+               qp->s_cur = qpriv->s_tid_cur;
+               qp->s_state = TID_OP(WRITE_RESP);
+       }
+}
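
A worked trace with hypothetical PSNs: for a retried TID RDMA WRITE
swqe spanning PSNs 100..103 (spsn = 100, lpsn = 103):

	update_qp_retry_state(qp, 100, 100, 103);
	/* first RESP: s_cur = s_tid_cur, s_state = TID_OP(WRITE_RESP) */
	update_qp_retry_state(qp, 103, 100, 103);
	/* last RESP: s_cur = s_tid_cur + 1 (mod s_size),
	 * s_state = TID_OP(WRITE_REQ), s_psn = 104 */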
+
 /**
  * do_rc_ack - process an incoming RC ACK
  * @qp: the QP the ACK came in on
@@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
        int ret = 0;
        u32 ack_psn;
        int diff;
+       struct rvt_dev_info *rdi;
 
        lockdep_assert_held(&qp->s_lock);
        /*
@@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
                     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
                    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-                       /* Retry this request. */
-                       if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
-                               qp->r_flags |= RVT_R_RDMAR_SEQ;
-                               hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
-                               if (list_empty(&qp->rspwait)) {
-                                       qp->r_flags |= RVT_R_RSP_SEND;
-                                       rvt_get_qp(qp);
-                                       list_add_tail(&qp->rspwait,
-                                                     &rcd->qp_wait_list);
-                               }
-                       }
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+                   (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+                    (delta_psn(psn, qp->s_last_psn) != 1))) {
+                       set_restart_qp(qp, rcd);
                        /*
                         * No need to process the ACK/NAK since we are
                         * restarting an earlier request.
@@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
                                hfi1_schedule_send(qp);
                        }
                }
+
+               /*
+                * TID RDMA WRITE requests will be completed by the TID RDMA
+                * ACK packet handler (see tid_rdma.c).
+                */
+               if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+                       break;
+
                wqe = do_rc_completion(qp, wqe, ibp);
                if (qp->s_acked == qp->s_tail)
                        break;
@@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
                        else
                                rvt_stop_rc_timers(qp);
                } else if (qp->s_acked != qp->s_tail) {
+                       struct rvt_swqe *__w = NULL;
+
+                       if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+                               __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
                        /*
-                        * We are expecting more ACKs so
-                        * mod the retry timer.
-                        */
-                       rvt_mod_retry_timer(qp);
-                       /*
-                        * We can stop re-sending the earlier packets and
-                        * continue with the next packet the receiver wants.
+                        * Stop timers if we've received all of the TID RDMA
+                        * WRITE responses.
                         */
-                       if (cmp_psn(qp->s_psn, psn) <= 0)
-                               reset_psn(qp, psn + 1);
+                       if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+                           opcode == TID_OP(WRITE_RESP)) {
+                               /*
+                                * Normally, the loop above would correctly
+                                * process all WQEs from s_acked onward and
+                                * either complete them or check for correct
+                                * PSN sequencing.
+                                * However, for TID RDMA, due to pipelining,
+                                * the response may not be for the request at
+                                * s_acked so the above loop would just be
+                                * skipped. This does not allow for checking
+                                * the PSN sequencing. It has to be done
+                                * separately.
+                                */
+                               if (cmp_psn(psn, qp->s_last_psn + 1)) {
+                                       set_restart_qp(qp, rcd);
+                                       goto bail_stop;
+                               }
+                               /*
+                                * If the psn is being resent, stop the
+                                * resending.
+                                */
+                               if (qp->s_cur != qp->s_tail &&
+                                   cmp_psn(qp->s_psn, psn) <= 0)
+                                       update_qp_retry_state(qp, psn,
+                                                             __w->psn,
+                                                             __w->lpsn);
+                               else if (--qpriv->pending_tid_w_resp)
+                                       rvt_mod_retry_timer(qp);
+                               else
+                                       rvt_stop_rc_timers(qp);
+                       } else {
+                               /*
+                                * We are expecting more ACKs so
+                                * mod the retry timer.
+                                */
+                               rvt_mod_retry_timer(qp);
+                               /*
+                                * We can stop re-sending the earlier packets
+                                * and continue with the next packet the
+                                * receiver wants.
+                                */
+                               if (cmp_psn(qp->s_psn, psn) <= 0)
+                                       reset_psn(qp, psn + 1);
+                       }
                } else {
                        /* No more acks - kill all timers */
                        rvt_stop_rc_timers(qp);
@@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
                rvt_get_credit(qp, aeth);
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                qp->s_retry = qp->s_retry_cnt;
+               /*
+                * If the current request is a TID RDMA WRITE request and the
+                * response is not a TID RDMA WRITE RESP packet, s_last_psn
+                * can't be advanced.
+                */
+               if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+                   opcode != TID_OP(WRITE_RESP) &&
+                   cmp_psn(psn, wqe->psn) >= 0)
+                       return 1;
                update_last_psn(qp, psn);
                return 1;
 
@@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
                        goto bail_stop;
                if (qp->s_flags & RVT_S_WAIT_RNR)
                        goto bail_stop;
-               if (qp->s_rnr_retry == 0) {
+               rdi = ib_to_rvt(qp->ibqp.device);
+               if (qp->s_rnr_retry == 0 &&
+                   !((rdi->post_parms[wqe->wr.opcode].flags &
+                     RVT_OPERATION_IGN_RNR_CNT) &&
+                     qp->s_rnr_retry_cnt == 0)) {
                        status = IB_WC_RNR_RETRY_EXC_ERR;
                        goto class_b;
                }
-               if (qp->s_rnr_retry_cnt < 7)
+               if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
                        qp->s_rnr_retry--;
 
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
+               /*
+                * The last valid PSN is the previous PSN. For TID RDMA WRITE
+                * request, s_last_psn should be incremented only when a TID
+                * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+                * WRITE RESP packets.
+                */
+               if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+                       reset_psn(qp, qp->s_last_psn + 1);
+               } else {
+                       update_last_psn(qp, psn - 1);
+                       reset_psn(qp, psn);
+               }
 
                ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
-               reset_psn(qp, psn);
-
                qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
                rvt_stop_rc_timers(qp);
                rvt_add_rnr_timer(qp, aeth);
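
The effect of the new RVT_OPERATION_IGN_RNR_CNT test above, factored
out as a sketch:

	/* With IGN_RNR_CNT set on the operation, an RNR retry count of
	 * 0 means "retry indefinitely" instead of "fail on the first
	 * RNR NAK" (a count of 7 already means indefinitely). */
	static bool rnr_retries_exhausted(struct rvt_qp *qp, u32 op_flags)
	{
		if ((op_flags & RVT_OPERATION_IGN_RNR_CNT) &&
		    qp->s_rnr_retry_cnt == 0)
			return false;
		return qp->s_rnr_retry == 0;
	}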
@@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
        while (cmp_psn(psn, wqe->lpsn) > 0) {
                if (wqe->wr.opcode == IB_WR_RDMA_READ ||
                    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+                   wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
                        break;
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 80111dd..490e47a 100644
@@ -3208,6 +3208,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
                        ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
                } while (!ret);
        }
+       for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+               struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+               if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+                       i = 0;
+               /* Free only locally allocated TID entries */
+               if (e->opcode != TID_OP(WRITE_REQ))
+                       continue;
+               do {
+                       struct hfi1_ack_priv *priv = e->priv;
+
+                       ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+               } while (!ret);
+       }
 }
 
 bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index e5e7fad..6764114 100644
@@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
                        0xffffffull),
                psn = val & mask;
        if (expct)
-               psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+               psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+                       ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
        else
                psn = psn + frags;
        return psn & mask;
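
KDETH PSNs carry a generation in the upper bits and a sequence number
in the low bits (HFI1_KDETH_BTH_SEQ_MASK), so the increment must wrap
within the sequence field without carrying into the generation. A
worked example, assuming the 11-bit sequence field used by this series:

	/* Hypothetical values: gen = 2, seq = 0x7fe, frags = 3. */
	u32 psn  = (2 << HFI1_KDETH_BTH_SEQ_SHIFT) | 0x7fe;
	u32 next = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
		   ((psn + 3) & HFI1_KDETH_BTH_SEQ_MASK);
	/* next == (2 << HFI1_KDETH_BTH_SEQ_SHIFT) | 0x001: the sequence
	 * wrapped while the generation stayed untouched. */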
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 7b87b77..ab97d71 100644
@@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
  */
 const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
        [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
        [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
        [IB_WR_SEND] = IB_WC_SEND,
        [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
@@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = {
        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
        [IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
+       [IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
        /* UC */
        [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
        [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
@@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = {
        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 
        /* TID RDMA has separate handlers for different opcodes. */
+       [IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
+       [IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
+       [IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
+       [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
        [IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
        [IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
+       [IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
+       [IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,
 
        /* UC */
        [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
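
The new handlers are reached through the same table-driven dispatch as
the existing RC/UC opcodes; conceptually (simplified, ignoring the
QP-type checks the driver performs before the call):

	opcode_handler handler = opcode_handler_tbl[ib_bth_get_opcode(ohdr)];

	if (handler)
		handler(packet);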
@@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
        rdi->dparms.props.max_mr_size = U64_MAX;
        rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
        rdi->dparms.props.max_qp = hfi1_max_qps;
-       rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+       rdi->dparms.props.max_qp_wr =
+               (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+                HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
        rdi->dparms.props.max_send_sge = hfi1_max_sges;
        rdi->dparms.props.max_recv_sge = hfi1_max_sges;
        rdi->dparms.props.max_sge_rd = hfi1_max_sges;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index bee3d21..62ace0b 100644
@@ -193,6 +193,7 @@ struct hfi1_qp_priv {
        u32 r_tid_ack;      /* the TID RDMA request to be ACK'ed */
        u32 r_tid_alloc;    /* Request for which we are allocating resources */
        u32 pending_tid_w_segs; /* Num of pending tid write segments */
+       u32 pending_tid_w_resp; /* Num of pending tid write responses */
        u32 alloc_w_segs;       /* Number of segments for which write */
                               /* resources have been allocated for this QP */
 
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 4ee612a..f0fbd40 100644
@@ -246,6 +246,7 @@ struct rvt_ack_entry {
 #define RVT_OPERATION_ATOMIC_SGE  0x00000004
 #define RVT_OPERATION_LOCAL       0x00000008
 #define RVT_OPERATION_USE_RESERVE 0x00000010
+#define RVT_OPERATION_IGN_RNR_CNT 0x00000020
 
 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)