/**
 * rvt_poll_cq - poll for work completion entries
 * @ibcq: the completion queue to poll
 * @num_entries: the maximum number of entries to return
 * @entry: pointer to array where work completions are placed
 *
 * This may be called from interrupt context.  Also called by ib_poll_cq()
 * in the generic verbs code.
 *
 * Return: the number of completion entries polled.
 */
int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_cq_wc *wc;
	unsigned long flags;
	int npolled;
	u32 tail;

	/* The kernel can only poll a kernel completion queue */
	if (cq->ip)
		return -EINVAL;

	spin_lock_irqsave(&cq->lock, flags);

	wc = cq->queue;
	tail = wc->tail;
	if (tail > (u32)cq->ibcq.cqe)
		tail = (u32)cq->ibcq.cqe;
	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
		if (tail == wc->head)
			break;
		/* The kernel doesn't need a RMB since it has the lock. */
		*entry = wc->kqueue[tail];
		if (tail >= cq->ibcq.cqe)
			tail = 0;
		else
			tail++;
	}
	wc->tail = tail;

	spin_unlock_irqrestore(&cq->lock, flags);

	return npolled;
}
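
/*
 * Hedged usage sketch (example_drain_cq is illustrative ULP code, not
 * part of the driver): callers normally reach rvt_poll_cq() through the
 * generic ib_poll_cq() entry point and drain in batches until it
 * returns 0.
 */
static void example_drain_cq(struct ib_cq *cq)
{
	struct ib_wc wc[8];
	int n, i;

	while ((n = ib_poll_cq(cq, ARRAY_SIZE(wc), wc)) > 0)
		for (i = 0; i < n; i++)
			pr_debug("wr_id %llu status %d\n",
				 (unsigned long long)wc[i].wr_id,
				 wc[i].status);
}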

/**
 * rvt_post_recv - post a receive on a QP
 * @ibqp: the QP to post the receive on
 * @wr: the WR to post
 * @bad_wr: the first bad WR is put here
 *
 * This may be called from interrupt context.
 *
 * Return: 0 on success otherwise errno
 */
int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
		  struct ib_recv_wr **bad_wr)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_rwq *wq = qp->r_rq.wq;
	unsigned long flags;
	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
				!qp->ibqp.srq;

	/* Check that state is OK to post receive. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
		*bad_wr = wr;
		return -EINVAL;
	}

	for (; wr; wr = wr->next) {
		struct rvt_rwqe *wqe;
		u32 next;
		int i;

		if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
			*bad_wr = wr;
			return -EINVAL;
		}

		spin_lock_irqsave(&qp->r_rq.lock, flags);
		next = wq->head + 1;
		if (next >= qp->r_rq.size)
			next = 0;
		if (next == wq->tail) {
			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
			*bad_wr = wr;
			return -ENOMEM;
		}
		if (unlikely(qp_err_flush)) {
			struct ib_wc wc;

			memset(&wc, 0, sizeof(wc));
			wc.qp = &qp->ibqp;
			wc.opcode = IB_WC_RECV;
			wc.wr_id = wr->wr_id;
			wc.status = IB_WC_WR_FLUSH_ERR;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
		} else {
			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
			wqe->wr_id = wr->wr_id;
			wqe->num_sge = wr->num_sge;
			for (i = 0; i < wr->num_sge; i++)
				wqe->sg_list[i] = wr->sg_list[i];
			/*
			 * Make sure queue entry is written
			 * before the head index.
			 */
			smp_wmb();
			wq->head = next;
		}
		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
	}
	return 0;
}
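
/*
 * Hedged usage sketch (assumed ULP code; the wr_id, address, and lkey
 * values are placeholders for a registered MR): a single-SGE receive is
 * posted through the generic ib_post_recv(), which reaches
 * rvt_post_recv() above for rdmavt devices.
 */
static int example_post_recv(struct ib_qp *qp, u64 dma_addr, u32 len, u32 lkey)
{
	struct ib_sge sge = {
		.addr   = dma_addr,
		.length = len,
		.lkey   = lkey,
	};
	struct ib_recv_wr wr = {
		.wr_id   = 1,	/* cookie returned in the completion */
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ib_recv_wr *bad_wr;

	return ib_post_recv(qp, &wr, &bad_wr);
}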

/*
 * Validate a RWQE and fill in the SGE state.
 * Return 1 if OK.
 */
static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
{
	int i, j, ret;
	struct ib_wc wc;
	struct rvt_lkey_table *rkt;
	struct rvt_pd *pd;
	struct rvt_sge_state *ss;

	rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
	pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
	ss = &qp->r_sge;
	ss->sg_list = qp->r_sg_list;
	qp->r_len = 0;
	for (i = j = 0; i < wqe->num_sge; i++) {
		if (wqe->sg_list[i].length == 0)
			continue;
		/* Check LKEY */
		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
				  NULL, &wqe->sg_list[i],
				  IB_ACCESS_LOCAL_WRITE);
		if (unlikely(ret <= 0))
			goto bad_lkey;
		qp->r_len += wqe->sg_list[i].length;
		j++;
	}
	ss->num_sge = j;
	ss->total_len = qp->r_len;
	ret = 1;
	goto bail;

bad_lkey:
	while (j) {
		struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;

		rvt_put_mr(sge->mr);
	}
	ss->num_sge = 0;
	memset(&wc, 0, sizeof(wc));
	wc.wr_id = wqe->wr_id;
	wc.status = IB_WC_LOC_PROT_ERR;
	wc.opcode = IB_WC_RECV;
	wc.qp = &qp->ibqp;
	/* Signal solicited completion event. */
	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
	ret = 0;
bail:
	return ret;
}
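
/*
 * Illustrative sketch (example_total_sge_len is hypothetical, not
 * driver code): how a consumer walks the SGE state built above.  The
 * first SGE lives in ss->sge and any further SGEs in
 * ss->sg_list[0..num_sge - 2], mirroring the j ? ... : ... selection
 * in qib_init_sge().
 */
static u32 example_total_sge_len(struct rvt_sge_state *ss)
{
	u32 len = 0;
	u8 i;

	for (i = 0; i < ss->num_sge; i++) {
		struct rvt_sge *sge = i ? &ss->sg_list[i - 1] : &ss->sge;

		len += sge->sge_length;
	}
	return len;
}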

/**
 * rvt_destroy_cq - destroy a completion queue
 * @ibcq: the completion queue to destroy.
 *
 * Called by ib_destroy_cq() in the generic verbs code.
 *
 * Return: always 0
 */
int rvt_destroy_cq(struct ib_cq *ibcq)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_dev_info *rdi = cq->rdi;

	flush_kthread_work(&cq->comptask);
	spin_lock(&rdi->n_cqs_lock);
	rdi->n_cqs_allocated--;
	spin_unlock(&rdi->n_cqs_lock);
	if (cq->ip)
		kref_put(&cq->ip->ref, rvt_release_mmap_info);
	else
		vfree(cq->queue);
	kfree(cq);

	return 0;
}
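
/*
 * Hedged usage sketch (example_* names are illustrative): from a ULP's
 * point of view, CQ setup and teardown go through the generic verbs
 * layer; ib_destroy_cq() dispatches to rvt_destroy_cq() above, which
 * flushes the completion task before freeing the queue memory.
 */
static struct ib_cq *example_cq_setup(struct ib_device *dev, int nent,
				      ib_comp_handler handler, void *ctx)
{
	struct ib_cq_init_attr attr = { .cqe = nent };

	return ib_create_cq(dev, handler, NULL, ctx, &attr);
}

static void example_cq_teardown(struct ib_cq *cq)
{
	ib_destroy_cq(cq);	/* reaches rvt_destroy_cq() for rdmavt */
}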

/**
 * rvt_req_notify_cq - change the notification type for a completion queue
 * @ibcq: the completion queue
 * @notify_flags: the type of notification to request
 *
 * This may be called from interrupt context.  Also called by
 * ib_req_notify_cq() in the generic verbs code.
 *
 * Return: 0 for success, or 1 if IB_CQ_REPORT_MISSED_EVENTS was requested
 * and entries are already pending on the queue.
 */
int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&cq->lock, flags);
	/*
	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
	 */
	if (cq->notify != IB_CQ_NEXT_COMP)
		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;

	if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
	    cq->queue->head != cq->queue->tail)
		ret = 1;

	spin_unlock_irqrestore(&cq->lock, flags);

	return ret;
}
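
/*
 * Hedged sketch of the re-arm idiom that return value supports
 * (example_poll_and_rearm is illustrative ULP code, not part of the
 * driver): re-arm with IB_CQ_REPORT_MISSED_EVENTS and, if the arm
 * raced with new entries (return > 0), poll again instead of waiting
 * for the next completion event.
 */
static void example_poll_and_rearm(struct ib_cq *cq)
{
	struct ib_wc wc;

	do {
		while (ib_poll_cq(cq, 1, &wc) > 0) {
			/* handle wc here */
		}
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				      IB_CQ_REPORT_MISSED_EVENTS) > 0);
}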

/**
 * rvt_error_qp - put a QP into the error state
 * @qp: the QP to put into the error state
 * @err: the receive completion error to signal if a RWQE is active
 *
 * Flushes both send and receive work queues.
 *
 * Return: true if last WQE event should be generated.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 * If we are already in error state, just return.
 */
int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
{
	struct ib_wc wc;
	int ret = 0;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
		goto bail;

	qp->state = IB_QPS_ERR;

	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
		del_timer(&qp->s_timer);
	}

	if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
		qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;

	rdi->driver_f.notify_error_qp(qp);

	/* Schedule the sending tasklet to drain the send work queue. */
	if (ACCESS_ONCE(qp->s_last) != qp->s_head)
		rdi->driver_f.schedule_send(qp);

	rvt_clear_mr_refs(qp, 0);

	memset(&wc, 0, sizeof(wc));
	wc.qp = &qp->ibqp;
	wc.opcode = IB_WC_RECV;

	if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
		wc.wr_id = qp->r_wr_id;
		wc.status = err;
		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
	}
	wc.status = IB_WC_WR_FLUSH_ERR;

	if (qp->r_rq.wq) {
		struct rvt_rwq *wq;
		u32 head;
		u32 tail;

		spin_lock(&qp->r_rq.lock);

		/* sanity check pointers before trusting them */
		wq = qp->r_rq.wq;
		head = wq->head;
		if (head >= qp->r_rq.size)
			head = 0;
		tail = wq->tail;
		if (tail >= qp->r_rq.size)
			tail = 0;
		while (tail != head) {
			wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
			if (++tail >= qp->r_rq.size)
				tail = 0;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
		}
		wq->tail = tail;

		spin_unlock(&qp->r_rq.lock);
	} else if (qp->ibqp.event_handler) {
		ret = 1;
	}

bail:
	return ret;
}
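
/*
 * Hedged caller sketch (example_move_to_error is illustrative, not
 * driver code): rvt_error_qp() is called with r_lock and s_lock held
 * and interrupts disabled; a nonzero return means the receive queue is
 * on an SRQ and the caller must generate the last-WQE-reached event.
 */
static void example_move_to_error(struct rvt_qp *qp)
{
	int lastwqe;

	spin_lock_irq(&qp->r_lock);
	spin_lock(&qp->s_lock);
	lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	spin_unlock(&qp->s_lock);
	spin_unlock_irq(&qp->r_lock);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}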

/**
 * rvt_resize_cq - change the size of the CQ
 * @ibcq: the completion queue
 * @cqe: the requested new number of entries
 * @udata: user data for libibverbs.so
 *
 * Return: 0 for success.
 */
int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_cq_wc *old_wc;
	struct rvt_cq_wc *wc;
	u32 head, tail, n;
	int ret;
	u32 sz;
	struct rvt_dev_info *rdi = cq->rdi;

	if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
		return -EINVAL;

	/*
	 * Need to use vmalloc() if we want to support large #s of entries.
	 */
	sz = sizeof(*wc);
	if (udata && udata->outlen >= sizeof(__u64))
		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
	else
		sz += sizeof(struct ib_wc) * (cqe + 1);
	wc = vmalloc_user(sz);
	if (!wc)
		return -ENOMEM;

	/* Check that we can write the offset to mmap. */
	if (udata && udata->outlen >= sizeof(__u64)) {
		__u64 offset = 0;

		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
		if (ret)
			goto bail_free;
	}

	spin_lock_irq(&cq->lock);
	/*
	 * Make sure head and tail are sane since they
	 * might be user writable.
	 */
	old_wc = cq->queue;
	head = old_wc->head;
	if (head > (u32)cq->ibcq.cqe)
		head = (u32)cq->ibcq.cqe;
	tail = old_wc->tail;
	if (tail > (u32)cq->ibcq.cqe)
		tail = (u32)cq->ibcq.cqe;
	if (head < tail)
		n = cq->ibcq.cqe + 1 + head - tail;
	else
		n = head - tail;
	if (unlikely((u32)cqe < n)) {
		ret = -EINVAL;
		goto bail_unlock;
	}
	for (n = 0; tail != head; n++) {
		if (cq->ip)
			wc->uqueue[n] = old_wc->uqueue[tail];
		else
			wc->kqueue[n] = old_wc->kqueue[tail];
		if (tail == (u32)cq->ibcq.cqe)
			tail = 0;
		else
			tail++;
	}
	cq->ibcq.cqe = cqe;
	wc->head = n;
	wc->tail = 0;
	cq->queue = wc;
	spin_unlock_irq(&cq->lock);

	vfree(old_wc);

	if (cq->ip) {
		struct rvt_mmap_info *ip = cq->ip;

		rvt_update_mmap_info(rdi, ip, sz, wc);

		/*
		 * Return the offset to mmap.
		 * See rvt_mmap() for details.
		 */
		if (udata && udata->outlen >= sizeof(__u64)) {
			ret = ib_copy_to_udata(udata, &ip->offset,
					       sizeof(ip->offset));
			if (ret)
				return ret;
		}

		spin_lock_irq(&rdi->pending_lock);
		if (list_empty(&ip->pending_mmaps))
			list_add(&ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	return 0;

bail_unlock:
	spin_unlock_irq(&cq->lock);
bail_free:
	vfree(wc);
	return ret;
}
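
/*
 * Illustrative helper (an assumption, not a driver function): the
 * occupancy computation used above.  The ring has cqe + 1 slots, so a
 * wrapped head (head < tail) is accounted for by adding the ring size
 * before subtracting the tail.
 */
static u32 example_cq_occupancy(u32 head, u32 tail, u32 cqe)
{
	return (head >= tail) ? head - tail : cqe + 1 + head - tail;
}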

/**
 * hfi1_uc_rcv - handle an incoming UC packet
 * @packet: the packet structure describing the incoming UC packet,
 *	    including the receive context, headers, payload, and QP
 *
 * This is called from qp_rcv() to process an incoming UC packet
 * for the given QP.
 * Called at interrupt level.
 */
void hfi1_uc_rcv(struct hfi1_packet *packet)
{
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	void *data = packet->payload;
	u32 tlen = packet->tlen;
	struct rvt_qp *qp = packet->qp;
	struct ib_other_headers *ohdr = packet->ohdr;
	u32 opcode = packet->opcode;
	u32 hdrsize = packet->hlen;
	u32 psn;
	u32 pad = packet->pad;
	struct ib_wc wc;
	u32 pmtu = qp->pmtu;
	struct ib_reth *reth;
	int ret;
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);

	if (hfi1_ruc_check_hdr(ibp, packet))
		return;

	process_ecn(qp, packet, true);

	psn = ib_bth_get_psn(ohdr);
	/* Compare the PSN versus the expected PSN. */
	if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
		/*
		 * Handle a sequence error.
		 * Silently drop any current message.
		 */
		qp->r_psn = psn;
inv:
		if (qp->r_state == OP(SEND_FIRST) ||
		    qp->r_state == OP(SEND_MIDDLE)) {
			set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
			qp->r_sge.num_sge = 0;
		} else {
			rvt_put_ss(&qp->r_sge);
		}
		qp->r_state = OP(SEND_LAST);
		switch (opcode) {
		case OP(SEND_FIRST):
		case OP(SEND_ONLY):
		case OP(SEND_ONLY_WITH_IMMEDIATE):
			goto send_first;

		case OP(RDMA_WRITE_FIRST):
		case OP(RDMA_WRITE_ONLY):
		case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
			goto rdma_first;

		default:
			goto drop;
		}
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
		goto inv;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto inv;

	default:
		if (opcode == OP(SEND_FIRST) ||
		    opcode == OP(SEND_ONLY) ||
		    opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_FIRST) ||
		    opcode == OP(RDMA_WRITE_ONLY) ||
		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
			break;
		goto inv;
	}

	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
		rvt_comm_est(qp);

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
send_first:
		if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
			qp->r_sge = qp->s_rdma_read_sge;
		} else {
			ret = hfi1_rvt_get_rwqe(qp, 0);
			if (ret < 0)
				goto op_err;
			if (!ret)
				goto drop;
			/*
			 * qp->s_rdma_read_sge will be the owner
			 * of the mr references.
			 */
			qp->s_rdma_read_sge = qp->r_sge;
		}
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto no_immediate_data;
		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
			goto send_last_imm;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		/* Check for invalid length PMTU or posted rwqe len. */
		/*
		 * There will be no padding for 9B packet but 16B packets
		 * will come in with some padding since we always add
		 * CRC and LT bytes which will need to be flit aligned
		 */
		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
			goto rewind;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto rewind;
		hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false);
		break;

	case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
		wc.ex.imm_data = ohdr->u.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;
	case OP(SEND_LAST):
no_immediate_data:
		wc.ex.imm_data = 0;
		wc.wc_flags = 0;
send_last:
		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + extra_bytes)))
			goto rewind;
		/* Don't count the CRC. */
		tlen -= (hdrsize + extra_bytes);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto rewind;
		wc.opcode = IB_WC_RECV;
		hfi1_copy_sge(&qp->r_sge, data, tlen, false, false);
		rvt_put_ss(&qp->s_rdma_read_sge);
last_imm:
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
		/*
		 * It seems that IB mandates the presence of an SL in a
		 * work completion only for the UD transport (see section
		 * 11.4.2 of IBTA Vol. 1).
		 *
		 * However, the way the SL is chosen below is consistent
		 * with the way that IB/qib works and is trying to avoid
		 * introducing incompatibilities.
		 *
		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
		 */
		wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
		/* zero fields that are N/A */
		wc.vendor_err = 0;
		wc.pkey_index = 0;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
			     ib_bth_is_solicited(ohdr));
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
rdma_first:
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE))) {
			goto drop;
		}
		reth = &ohdr->u.rc.reth;
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		qp->r_sge.sg_list = NULL;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey */
			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
					 vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto drop;
			qp->r_sge.num_sge = 1;
		} else {
			qp->r_sge.num_sge = 0;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_ONLY)) {
			goto rdma_last;
		} else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
			wc.ex.imm_data = ohdr->u.rc.imm_data;
			goto rdma_last_imm;
		}
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto drop;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto drop;
		hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		wc.ex.imm_data = ohdr->u.imm_data;
rdma_last_imm:
		wc.wc_flags = IB_WC_WITH_IMM;

		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto drop;
		/* Don't count the CRC. */
		tlen -= (hdrsize + extra_bytes);
		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
			goto drop;
		if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) {
			rvt_put_ss(&qp->s_rdma_read_sge);
		} else {
			ret = hfi1_rvt_get_rwqe(qp, 1);
			if (ret < 0)
				goto op_err;
			if (!ret)
				goto drop;
		}
		wc.byte_len = qp->r_len;
		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
		rvt_put_ss(&qp->r_sge);
		goto last_imm;

	case OP(RDMA_WRITE_LAST):
rdma_last:
		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto drop;
		/* Don't count the CRC. */
		tlen -= (hdrsize + extra_bytes);
		if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
			goto drop;
		hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
		rvt_put_ss(&qp->r_sge);
		break;

	default:
		/* Drop packet for unknown opcodes. */
		goto drop;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	return;

rewind:
	set_bit(RVT_R_REWIND_SGE, &qp->r_aflags);
	qp->r_sge.num_sge = 0;
drop:
	ibp->rvp.n_pkt_drops++;
	return;

op_err:
	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
}
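
/*
 * For reference, a hedged sketch equivalent to the driver's cmp_psn()
 * used in the sequence check above: the difference of the two PSNs is
 * shifted up and back down so it is sign-extended from 24 bits, which
 * makes the compare wrap-safe.  Negative means a precedes b, zero means
 * equal, positive means a follows b.
 */
static inline int example_cmp_psn(u32 a, u32 b)
{
	return (((int)a) - ((int)b)) << 8 >> 8;
}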