/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}
static int hns_roce_u_v1_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
				   struct ibv_send_wr **bad_wr)
{
	unsigned int ind;
	void *wqe;
	int nreq;
	int ps_opcode, i;
	int ret = 0;
	struct hns_roce_wqe_ctrl_seg *ctrl = NULL;
	struct hns_roce_wqe_data_seg *dseg = NULL;
	struct hns_roce_qp *qp = to_hr_qp(ibvqp);
	struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context);

	pthread_spin_lock(&qp->sq.lock);

	/* check that state is OK to post send */
	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (hns_roce_wq_overflow(&qp->sq, nreq,
					 to_hr_cq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			printf("wr->num_sge(<=%d) = %d, check failed!\r\n",
			       qp->sq.max_gs, wr->num_sge);
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		memset(ctrl, 0, sizeof(struct hns_roce_wqe_ctrl_seg));

		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
		for (i = 0; i < wr->num_sge; i++)
			ctrl->msg_length = htole32(le32toh(ctrl->msg_length) +
						   wr->sg_list[i].length);

		ctrl->flag |= htole32(((wr->send_flags & IBV_SEND_SIGNALED) ?
				      HNS_ROCE_WQE_CQ_NOTIFY : 0) |
			      (wr->send_flags & IBV_SEND_SOLICITED ?
				      HNS_ROCE_WQE_SE : 0) |
			      ((wr->opcode == IBV_WR_SEND_WITH_IMM ||
			       wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) ?
				      HNS_ROCE_WQE_IMM : 0) |
			      (wr->send_flags & IBV_SEND_FENCE ?
				      HNS_ROCE_WQE_FENCE : 0));

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm_data = htole32(be32toh(wr->imm_data));

		wqe += sizeof(struct hns_roce_wqe_ctrl_seg);

		/* set remote addr segment */
		switch (ibvqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_READ:
				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_READ;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				break;

			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_WRITE;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				break;

			case IBV_WR_SEND:
			case IBV_WR_SEND_WITH_IMM:
				ps_opcode = HNS_ROCE_WQE_OPCODE_SEND;
				break;

			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
			default:
				ps_opcode = HNS_ROCE_WQE_OPCODE_MASK;
				break;
			}
			ctrl->flag |= htole32(ps_opcode);
			wqe += sizeof(struct hns_roce_wqe_raddr_seg);
			break;
		case IBV_QPT_UC:
		case IBV_QPT_UD:
		default:
			break;
		}

		dseg = wqe;

		/* Inline */
		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			if (le32toh(ctrl->msg_length) > qp->max_inline_data) {
				ret = -1;
				*bad_wr = wr;
				printf("inline data len(1-32)=%d, send_flags = 0x%x, check failed!\r\n",
				       le32toh(ctrl->msg_length),
				       wr->send_flags);
				goto out;
			}

			for (i = 0; i < wr->num_sge; i++) {
				memcpy(wqe,
				       ((void *) (uintptr_t) wr->sg_list[i].addr),
				       wr->sg_list[i].length);
				wqe = wqe + wr->sg_list[i].length;
			}

			ctrl->flag |= htole32(HNS_ROCE_WQE_INLINE);
		} else {
			/* set sge */
			for (i = 0; i < wr->num_sge; i++)
				set_data_seg(dseg + i, wr->sg_list + i);

			ctrl->flag |=
			       htole32(wr->num_sge << HNS_ROCE_WQE_SGE_NUM_BIT);
		}

		ind++;
	}

out:
	/* Ring the SQ doorbell for everything posted so far */
	if (likely(nreq)) {
		qp->sq.head += nreq;

		hns_roce_update_sq_head(ctx, qp->ibv_qp.qp_num,
					qp->port_num - 1, qp->sl,
					qp->sq.head & ((qp->sq.wqe_cnt << 1) -
					1));
	}

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
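/*
 * For context only: the routine above is never called directly by an
 * application; it is reached through the generic ibv_post_send() entry
 * point. The following is a minimal, illustrative caller-side sketch (not
 * part of this provider) assuming an already-connected RC QP and a
 * registered buffer; post_inline_send(), buf and lkey are hypothetical
 * names used only for this example.
 */
#include <string.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: posts a small inline SEND on an RC QP. */
static int post_inline_send(struct ibv_qp *qp, void *buf, uint32_t lkey,
			    uint32_t len)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t) buf,
		.length = len,
		.lkey   = lkey,	/* not used for inline payloads, but harmless */
	};
	struct ibv_send_wr wr = {
		.wr_id      = 1,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IBV_WR_SEND,
		/*
		 * IBV_SEND_INLINE takes the payload-copy branch above;
		 * len must not exceed the QP's max_inline_data.
		 */
		.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE,
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(qp, &wr, &bad_wr);
}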
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall thru */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}
			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
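/*
 * For context only: mlx4_post_send() also sits behind the generic
 * ibv_post_send() entry point. The following is a minimal, illustrative
 * caller-side sketch (not part of this provider) that posts one signaled
 * RDMA WRITE; post_rdma_write() is a hypothetical name, and remote_addr
 * and rkey are assumed to have been exchanged out of band.
 */
#include <infiniband/verbs.h>

/* Hypothetical helper: posts one signaled RDMA WRITE on a connected QP. */
static int post_rdma_write(struct ibv_qp *qp, struct ibv_mr *mr, void *buf,
			   uint32_t len, uint64_t remote_addr, uint32_t rkey)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t) buf,
		.length = len,
		.lkey   = mr->lkey,
	};
	struct ibv_send_wr wr = {
		.wr_id      = 42,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IBV_WR_RDMA_WRITE,
		.send_flags = IBV_SEND_SIGNALED,
		.wr.rdma.remote_addr = remote_addr,
		.wr.rdma.rkey        = rkey,
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(qp, &wr, &bad_wr);
}
/*
 * Note that a single small inline post (IBV_SEND_INLINE, one WQE) may be
 * written out through the BlueFlame page in the out: block above instead
 * of the regular doorbell; the RDMA WRITE in this sketch always takes the
 * doorbell path.
 */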
static int hns_roce_v1_poll_one(struct hns_roce_cq *cq,
				struct hns_roce_qp **cur_qp, struct ibv_wc *wc)
{
	uint32_t qpn;
	int is_send;
	uint16_t wqe_ctr;
	uint32_t local_qpn;
	struct hns_roce_wq *wq = NULL;
	struct hns_roce_cqe *cqe = NULL;
	struct hns_roce_wqe_ctrl_seg *sq_wqe = NULL;

	/* Find the CQE indicated by the current consumer index (CI) */
	cqe = next_cqe_sw(cq);
	if (!cqe)
		return CQ_EMPTY;

	/* Advance the consumer index past this CQE */
	++cq->cons_index;

	udma_from_device_barrier();

	qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M,
			     CQE_BYTE_16_LOCAL_QPN_S);

	is_send = (roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_SQ_RQ_FLAG_S) ==
		   HNS_ROCE_CQE_IS_SQ);

	local_qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M,
				   CQE_BYTE_16_LOCAL_QPN_S);

	/* If there is no cached QP, or the QPN changed, look the QP up */
	if (!*cur_qp ||
	    (local_qpn & HNS_ROCE_CQE_QPN_MASK) != (*cur_qp)->ibv_qp.qp_num) {

		*cur_qp = hns_roce_find_qp(to_hr_ctx(cq->ibv_cq.context),
					   qpn & 0xffffff);
		if (!*cur_qp) {
			fprintf(stderr, PFX "can't find qp!\n");
			return CQ_POLL_ERR;
		}
	}

	wc->qp_num = qpn & 0xffffff;

	if (is_send) {
		wq = &(*cur_qp)->sq;
		/*
		 * If sq_signal_bits is set, first advance the tail pointer
		 * to the WQE this CQE refers to
		 */
		if ((*cur_qp)->sq_signal_bits) {
			wqe_ctr = (uint16_t)(roce_get_field(cqe->cqe_byte_4,
						CQE_BYTE_4_WQE_INDEX_M,
						CQE_BYTE_4_WQE_INDEX_S));
			/*
			 * wq->tail only ever increases; wrapping past 32 bits
			 * is harmless since only the low bits index the queue
			 */
			wq->tail += (wqe_ctr - (uint16_t) wq->tail) &
				    (wq->wqe_cnt - 1);
		}

		/* report the wr_id of this WQE in the wc */
		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
		++wq->tail;
	} else {
		wq = &(*cur_qp)->rq;
		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
		++wq->tail;
	}

	/*
	 * If HW generated an error CQE, translate the error into the wc
	 * status and return directly
	 */
	if (roce_get_field(cqe->cqe_byte_4, CQE_BYTE_4_STATUS_OF_THE_OPERATION_M,
			   CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) !=
	    HNS_ROCE_CQE_SUCCESS) {
		hns_roce_handle_error_cqe(cqe, wc);
		return CQ_OK;
	}

	wc->status = IBV_WC_SUCCESS;

	/*
	 * Fill in the wc opcode and related fields based on the CQE
	 * opcode type
	 */
	if (is_send) {
		/* For sends, recover opcode and flags from the original SQ WQE */
		sq_wqe = (struct hns_roce_wqe_ctrl_seg *)
			 get_send_wqe(*cur_qp,
				      roce_get_field(cqe->cqe_byte_4,
						     CQE_BYTE_4_WQE_INDEX_M,
						     CQE_BYTE_4_WQE_INDEX_S));
		switch (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) {
		case HNS_ROCE_WQE_OPCODE_SEND:
			wc->opcode = IBV_WC_SEND;
			break;
		case HNS_ROCE_WQE_OPCODE_RDMA_READ:
			wc->opcode = IBV_WC_RDMA_READ;
			wc->byte_len = le32toh(cqe->byte_cnt);
			break;
		case HNS_ROCE_WQE_OPCODE_RDMA_WRITE:
			wc->opcode = IBV_WC_RDMA_WRITE;
			break;
		case HNS_ROCE_WQE_OPCODE_BIND_MW2:
			wc->opcode = IBV_WC_BIND_MW;
			break;
		default:
			wc->status = IBV_WC_GENERAL_ERR;
			break;
		}
		wc->wc_flags = (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_IMM ?
				IBV_WC_WITH_IMM : 0);
	} else {
		/* For receives (RQ/SRQ), take opcode and flags from the CQE */
		wc->byte_len = le32toh(cqe->byte_cnt);
		switch (roce_get_field(cqe->cqe_byte_4,
				       CQE_BYTE_4_OPERATION_TYPE_M,
				       CQE_BYTE_4_OPERATION_TYPE_S) &
			HNS_ROCE_CQE_OPCODE_MASK) {
		case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE:
			wc->opcode   = IBV_WC_RECV_RDMA_WITH_IMM;
			wc->wc_flags = IBV_WC_WITH_IMM;
			wc->imm_data = htobe32(le32toh(cqe->immediate_data));
			break;
		case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE:
			if (roce_get_bit(cqe->cqe_byte_4,
					 CQE_BYTE_4_IMMEDIATE_DATA_FLAG_S)) {
				wc->opcode   = IBV_WC_RECV;
				wc->wc_flags = IBV_WC_WITH_IMM;
				wc->imm_data =
					htobe32(le32toh(cqe->immediate_data));
			} else {
				wc->opcode = IBV_WC_RECV;
				wc->wc_flags = 0;
			}
			break;
		default:
			wc->status = IBV_WC_GENERAL_ERR;
			break;
		}
	}

	return CQ_OK;
}
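/*
 * For context only: hns_roce_v1_poll_one() consumes a single CQE on behalf
 * of ibv_poll_cq(). The following is a minimal, illustrative consumer-side
 * sketch (not part of this provider) that busy-polls for one completion and
 * checks its status; wait_for_completion() is a hypothetical name used only
 * for this example.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: busy-polls one completion and checks its status. */
static int wait_for_completion(struct ibv_cq *cq)
{
	struct ibv_wc wc;
	int n;

	do {
		/* dispatches to the provider's poll routine */
		n = ibv_poll_cq(cq, 1, &wc);
	} while (n == 0);

	if (n < 0) {
		fprintf(stderr, "ibv_poll_cq failed\n");
		return -1;
	}

	if (wc.status != IBV_WC_SUCCESS) {
		fprintf(stderr, "wr_id %llu failed: %s\n",
			(unsigned long long) wc.wr_id,
			ibv_wc_status_str(wc.status));
		return -1;
	}

	return 0;
}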