Example #1
/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}
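Every example in this listing resolves a send-queue slot to an address with get_send_wqe(), which the listing itself omits; each provider defines its own version. A minimal sketch of the mlx4-style helper, assuming a contiguous QP buffer whose send queue starts at qp->sq.offset and whose slots are 1 << qp->sq.wqe_shift bytes (buf.buf and sq.offset are assumed field names, not taken from the code above), could look like this:

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	/* Sketch only: slot n is assumed to live at
	 * sq.offset + n * (1 << wqe_shift) inside the QP's contiguous buffer.
	 */
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}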
Example #2
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}
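As a worked example (my own arithmetic, not part of the source): with 64-byte WQE slots, wqe_shift is 6, so fence_size = 1 << 2 = 4 sixteen-byte units; stamp_send_wqe then computes ds = 4 << 2 = 16 dwords and its loop never executes, because the only 64-byte chunk is the first one, which is deliberately left unstamped. With 128-byte slots (wqe_shift = 7), fence_size is 8, ds is 32, and the loop overwrites dword 16, i.e. the first four bytes of the second 64-byte chunk.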
Example #3
static int hns_roce_u_v1_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
				   struct ibv_send_wr **bad_wr)
{
	unsigned int ind;
	void *wqe;
	int nreq;
	int ps_opcode, i;
	int ret = 0;
	struct hns_roce_wqe_ctrl_seg *ctrl = NULL;
	struct hns_roce_wqe_data_seg *dseg = NULL;
	struct hns_roce_qp *qp = to_hr_qp(ibvqp);
	struct hns_roce_context *ctx = to_hr_ctx(ibvqp->context);

	pthread_spin_lock(&qp->sq.lock);

	/* check that state is OK to post send */
	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (hns_roce_wq_overflow(&qp->sq, nreq,
					 to_hr_cq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}
		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			printf("wr->num_sge(<=%d) = %d, check failed!\r\n",
				qp->sq.max_gs, wr->num_sge);
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		memset(ctrl, 0, sizeof(struct hns_roce_wqe_ctrl_seg));

		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
		for (i = 0; i < wr->num_sge; i++)
			ctrl->msg_length = htole32(le32toh(ctrl->msg_length) +
						   wr->sg_list[i].length);

		ctrl->flag |= htole32(((wr->send_flags & IBV_SEND_SIGNALED) ?
				HNS_ROCE_WQE_CQ_NOTIFY : 0) |
			      (wr->send_flags & IBV_SEND_SOLICITED ?
				HNS_ROCE_WQE_SE : 0) |
			      ((wr->opcode == IBV_WR_SEND_WITH_IMM ||
			       wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) ?
				HNS_ROCE_WQE_IMM : 0) |
			      (wr->send_flags & IBV_SEND_FENCE ?
				HNS_ROCE_WQE_FENCE : 0));

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm_data = htole32(be32toh(wr->imm_data));

		wqe += sizeof(struct hns_roce_wqe_ctrl_seg);

		/* set remote addr segment */
		switch (ibvqp->qp_type) {
		case IBV_QPT_RC:
			switch (wr->opcode) {
			case IBV_WR_RDMA_READ:
				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_READ;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				break;
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				ps_opcode = HNS_ROCE_WQE_OPCODE_RDMA_WRITE;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				break;
			case IBV_WR_SEND:
			case IBV_WR_SEND_WITH_IMM:
				ps_opcode = HNS_ROCE_WQE_OPCODE_SEND;
				break;
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
			default:
				ps_opcode = HNS_ROCE_WQE_OPCODE_MASK;
				break;
			}
			ctrl->flag |= htole32(ps_opcode);
			wqe  += sizeof(struct hns_roce_wqe_raddr_seg);
			break;
		case IBV_QPT_UC:
		case IBV_QPT_UD:
		default:
			break;
		}

		dseg = wqe;

		/* Inline */
		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			if (le32toh(ctrl->msg_length) > qp->max_inline_data) {
				ret = -1;
				*bad_wr = wr;
				printf("inline data len(1-32)=%d, send_flags = 0x%x, check failed!\r\n",
					wr->send_flags, ctrl->msg_length);
				return ret;
			}

			for (i = 0; i < wr->num_sge; i++) {
				memcpy(wqe,
				     ((void *) (uintptr_t) wr->sg_list[i].addr),
				     wr->sg_list[i].length);
				wqe = wqe + wr->sg_list[i].length;
			}

			ctrl->flag |= htole32(HNS_ROCE_WQE_INLINE);
		} else {
			/* set sge */
			for (i = 0; i < wr->num_sge; i++)
				set_data_seg(dseg+i, wr->sg_list + i);

			ctrl->flag |=
			       htole32(wr->num_sge << HNS_ROCE_WQE_SGE_NUM_BIT);
		}

		ind++;
	}

out:
	/* Ring the SQ doorbell (update the SQ head) if any WQEs were posted */
	if (likely(nreq)) {
		qp->sq.head += nreq;

		hns_roce_update_sq_head(ctx, qp->ibv_qp.qp_num,
				qp->port_num - 1, qp->sl,
				qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
	}

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
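Both post-send routines above sit behind the generic verbs entry point ibv_post_send(). A minimal caller sketch (illustrative only; qp, mr, buf and len are assumed to come from the usual ibv_create_qp()/ibv_reg_mr() setup and are not part of the listing):

#include <stdint.h>
#include <infiniband/verbs.h>

static int post_one_send(struct ibv_qp *qp, struct ibv_mr *mr,
			 void *buf, uint32_t len, uint64_t wr_id)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t) buf,	/* registered buffer */
		.length = len,
		.lkey   = mr->lkey,
	};
	struct ibv_send_wr wr = {
		.wr_id      = wr_id,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IBV_WR_SEND,
		.send_flags = IBV_SEND_SIGNALED,	/* request a CQE */
	};
	struct ibv_send_wr *bad_wr;

	/* Dispatches to the provider's post_send, e.g. the routines above. */
	return ibv_post_send(qp, &wr, &bad_wr);
}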
Example #4
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
			  struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall thru */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}

			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
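Both posting paths guard the ring with an overflow check (wq_overflow() in mlx4, hns_roce_wq_overflow() in hns) that the listing does not show. A minimal sketch in the mlx4 style, assuming head and tail are free-running counters and max_post is the queue depth reported to the user:

static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	/* Fast path: the outstanding WQEs (head - tail) plus the new
	 * requests still fit, so no locking is needed.
	 */
	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	/* Slow path: tail is only advanced under the CQ lock, so re-read
	 * it there before declaring the queue full.
	 */
	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}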
Example #5
static int hns_roce_v1_poll_one(struct hns_roce_cq *cq,
				struct hns_roce_qp **cur_qp, struct ibv_wc *wc)
{
	uint32_t qpn;
	int is_send;
	uint16_t wqe_ctr;
	uint32_t local_qpn;
	struct hns_roce_wq *wq = NULL;
	struct hns_roce_cqe *cqe = NULL;
	struct hns_roce_wqe_ctrl_seg *sq_wqe = NULL;

	/* Find the CQE at the current consumer index (CI), if any */
	cqe = next_cqe_sw(cq);
	if (!cqe)
		return CQ_EMPTY;

	/* Consume this CQE by advancing the consumer index */
	++cq->cons_index;

	udma_from_device_barrier();

	qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M,
			     CQE_BYTE_16_LOCAL_QPN_S);

	is_send = (roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_SQ_RQ_FLAG_S) ==
		   HNS_ROCE_CQE_IS_SQ);

	local_qpn = roce_get_field(cqe->cqe_byte_16, CQE_BYTE_16_LOCAL_QPN_M,
				   CQE_BYTE_16_LOCAL_QPN_S);

	/* Look the QP up by QPN if none is cached or the CQE belongs to another QP */
	if (!*cur_qp ||
	    (local_qpn & HNS_ROCE_CQE_QPN_MASK) != (*cur_qp)->ibv_qp.qp_num) {

		*cur_qp = hns_roce_find_qp(to_hr_ctx(cq->ibv_cq.context),
					   qpn & 0xffffff);
		if (!*cur_qp) {
			fprintf(stderr, PFX "can't find qp!\n");
			return CQ_POLL_ERR;
		}
	}
	wc->qp_num = qpn & 0xffffff;

	if (is_send) {
		wq = &(*cur_qp)->sq;
		/*
		 * If sq_signal_bits is set, not every send WQE generates a
		 * CQE, so first advance the tail to the WQE this CQE refers to.
		 */
		if ((*cur_qp)->sq_signal_bits) {
			wqe_ctr = (uint16_t)(roce_get_field(cqe->cqe_byte_4,
						CQE_BYTE_4_WQE_INDEX_M,
						CQE_BYTE_4_WQE_INDEX_S));
			/*
			 * wq->tail is a free-running counter: it only ever
			 * increases, wraps naturally at 32 bits, and is masked
			 * with (wqe_cnt - 1) when used as an index.
			 */
			wq->tail += (wqe_ctr - (uint16_t) wq->tail) &
				    (wq->wqe_cnt - 1);
		}
		/* write the wr_id of wq into the wc */
		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
		++wq->tail;
	} else {
		wq = &(*cur_qp)->rq;
		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
		++wq->tail;
	}

	/*
	 * If the hardware generated an error CQE, translate it into the
	 * corresponding wc status and return immediately.
	 */
	if (roce_get_field(cqe->cqe_byte_4,
	    CQE_BYTE_4_STATUS_OF_THE_OPERATION_M,
	    CQE_BYTE_4_STATUS_OF_THE_OPERATION_S) != HNS_ROCE_CQE_SUCCESS) {
		hns_roce_handle_error_cqe(cqe, wc);
		return CQ_OK;
	}
	wc->status = IBV_WC_SUCCESS;

	/*
	 * Fill in wc->opcode and the related fields from the opcode recorded
	 * in the CQE (or, for sends, in the corresponding WQE).
	 */
	if (is_send) {
		/* For sends, the opcode and flags come from the WQE the CQE points to */
		sq_wqe = (struct hns_roce_wqe_ctrl_seg *)
			 get_send_wqe(*cur_qp, roce_get_field(cqe->cqe_byte_4,
						CQE_BYTE_4_WQE_INDEX_M,
						CQE_BYTE_4_WQE_INDEX_S));
		switch (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) {
		case HNS_ROCE_WQE_OPCODE_SEND:
			wc->opcode = IBV_WC_SEND;
			break;
		case HNS_ROCE_WQE_OPCODE_RDMA_READ:
			wc->opcode = IBV_WC_RDMA_READ;
			wc->byte_len = le32toh(cqe->byte_cnt);
			break;
		case HNS_ROCE_WQE_OPCODE_RDMA_WRITE:
			wc->opcode = IBV_WC_RDMA_WRITE;
			break;
		case HNS_ROCE_WQE_OPCODE_BIND_MW2:
			wc->opcode = IBV_WC_BIND_MW;
			break;
		default:
			wc->status = IBV_WC_GENERAL_ERR;
			break;
		}
		wc->wc_flags = (le32toh(sq_wqe->flag) & HNS_ROCE_WQE_IMM ?
				IBV_WC_WITH_IMM : 0);
	} else {
		/* For RQ/SRQ completions, the opcode and flags come from the CQE */
		wc->byte_len = le32toh(cqe->byte_cnt);

		switch (roce_get_field(cqe->cqe_byte_4,
				       CQE_BYTE_4_OPERATION_TYPE_M,
				       CQE_BYTE_4_OPERATION_TYPE_S) &
			HNS_ROCE_CQE_OPCODE_MASK) {
		case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE:
			wc->opcode   = IBV_WC_RECV_RDMA_WITH_IMM;
			wc->wc_flags = IBV_WC_WITH_IMM;
			wc->imm_data = htobe32(le32toh(cqe->immediate_data));
			break;
		case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE:
			if (roce_get_bit(cqe->cqe_byte_4,
					 CQE_BYTE_4_IMMEDIATE_DATA_FLAG_S)) {
				wc->opcode   = IBV_WC_RECV;
				wc->wc_flags = IBV_WC_WITH_IMM;
				wc->imm_data =
					htobe32(le32toh(cqe->immediate_data));
			} else {
				wc->opcode   = IBV_WC_RECV;
				wc->wc_flags = 0;
			}
			break;
		default:
			wc->status = IBV_WC_GENERAL_ERR;
			break;
		}
	}

	return CQ_OK;
}
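The poll routine above is what ultimately services a user's ibv_poll_cq() call. A typical consumer loop over the standard verbs API (illustrative only; cq comes from ibv_create_cq() and is not part of the listing):

#include <stdio.h>
#include <infiniband/verbs.h>

static int wait_for_completion(struct ibv_cq *cq)
{
	struct ibv_wc wc;
	int n;

	/* Busy-poll until one completion is returned. */
	do {
		n = ibv_poll_cq(cq, 1, &wc);
	} while (n == 0);

	if (n < 0)
		return -1;	/* provider reported a poll error */

	if (wc.status != IBV_WC_SUCCESS) {
		fprintf(stderr, "wr_id %llu failed: %s\n",
			(unsigned long long) wc.wr_id,
			ibv_wc_status_str(wc.status));
		return -1;
	}

	return 0;
}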