Example #1
0
static void rxd_set_rx_credits(struct rxd_ep *ep, struct rxd_rx_entry *rx_entry)
{
	size_t num_pkts, avail, size_left;

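	/* Bytes of the message still outstanding, and the number of
	 * MTU-sized packets needed to carry them (rounded up). */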
	size_left = rx_entry->op_hdr.size - rx_entry->done;
	num_pkts = (size_left + rxd_ep_domain(ep)->max_mtu_sz - 1) /
		    rxd_ep_domain(ep)->max_mtu_sz;
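	/* Grant no more than the endpoint's free credits, the packets still
	 * expected, or RXD_MAX_RX_CREDITS. */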
	avail = MIN(ep->credits, num_pkts);
	rx_entry->credits = MIN(avail, RXD_MAX_RX_CREDITS);
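	/* Advance the receive window by the granted credits and charge them
	 * against the endpoint. */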
	rx_entry->last_win_seg += rx_entry->credits;
	ep->credits -= rx_entry->credits;
}
Example #2
0
void rxd_init_data_pkt(struct rxd_ep *ep, struct rxd_x_entry *tx_entry,
		       struct rxd_pkt_entry *pkt_entry)
{
	struct rxd_data_pkt *data_pkt = (struct rxd_data_pkt *) (pkt_entry->pkt);
	uint32_t seg_size;

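	/* This segment carries whatever payload remains, capped at the
	 * domain's maximum segment size. */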
	seg_size = tx_entry->cq_entry.len - tx_entry->bytes_done;
	seg_size = MIN(rxd_ep_domain(ep)->max_seg_sz, seg_size);

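	/* Fill the base header; data belonging to a read transfer is marked
	 * RXD_DATA_READ, everything else is plain RXD_DATA. */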
	data_pkt->base_hdr.version = RXD_PROTOCOL_VERSION;
	data_pkt->base_hdr.type = (tx_entry->cq_entry.flags &
				  (FI_READ | FI_REMOTE_READ)) ?
				   RXD_DATA_READ : RXD_DATA;

	data_pkt->ext_hdr.rx_id = tx_entry->rx_id;
	data_pkt->ext_hdr.tx_id = tx_entry->tx_id;
	data_pkt->ext_hdr.seg_no = tx_entry->next_seg_no++;
	data_pkt->base_hdr.peer = ep->peers[tx_entry->peer].peer_addr;

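	/* Copy the next seg_size bytes out of the source iov, starting at the
	 * current transfer offset. */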
	pkt_entry->pkt_size = ofi_copy_from_iov(data_pkt->msg, seg_size,
						tx_entry->iov,
						tx_entry->iov_count,
						tx_entry->bytes_done);
	pkt_entry->peer = tx_entry->peer;

	tx_entry->bytes_done += pkt_entry->pkt_size;

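	/* The wire size adds the data packet header and the core provider's
	 * TX prefix. */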
	pkt_entry->pkt_size += sizeof(*data_pkt) + ep->tx_prefix_size;
}
Example #3
0
static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep,
		const struct iovec *iov, size_t iov_count,
		const struct fi_rma_iov *rma_iov, size_t rma_count,
		fi_addr_t addr, void *context, uint32_t op, uint64_t data,
		uint32_t rxd_flags)
{
	struct rxd_x_entry *tx_entry;
	fi_addr_t rxd_addr;
	ssize_t ret = -FI_EAGAIN;

	assert(iov_count <= RXD_IOV_LIMIT && rma_count <= RXD_IOV_LIMIT);
	assert(ofi_total_iov_len(iov, iov_count) <= rxd_ep_domain(rxd_ep)->max_inline_rma);

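	/* Hold both the endpoint lock and the TX CQ lock across the whole
	 * queueing sequence. */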
	fastlock_acquire(&rxd_ep->util_ep.lock);
	fastlock_acquire(&rxd_ep->util_ep.tx_cq->cq_lock);

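	/* Back off with -FI_EAGAIN if there is no room left to report a
	 * TX completion. */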
	if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq))
		goto out;

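	/* Translate the fi_addr to the rxd peer index and send an RTS to the
	 * peer if one has not been sent yet. */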
	rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr];
	ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr);
	if (ret)
		goto out;

	tx_entry = rxd_tx_entry_init(rxd_ep, iov, iov_count, NULL, 0, rma_count, data,
				     0, context, rxd_addr, op, rxd_flags);
	if (!tx_entry) {
		ret = -FI_EAGAIN;	/* no tx entry available; ask the caller to retry */
		goto out;
	}

	ret = rxd_ep_send_op(rxd_ep, tx_entry, rma_iov, rma_count, NULL, 0, 0, 0);
	if (ret) {
		rxd_tx_entry_free(rxd_ep, tx_entry);
		goto out;
	}

	if (tx_entry->op == RXD_READ_REQ)
		goto out;

	ret = 0;

out:
	fastlock_release(&rxd_ep->util_ep.tx_cq->cq_lock);
	fastlock_release(&rxd_ep->util_ep.lock);
	return ret;
}
Example #4
0
int rxd_ep_post_buf(struct rxd_ep *ep)
{
	struct rxd_pkt_entry *pkt_entry;
	ssize_t ret;

	pkt_entry = rxd_get_rx_pkt(ep);
	if (!pkt_entry)
		return -FI_ENOMEM;

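	/* Post the buffer to the underlying datagram endpoint, accepting data
	 * from any peer. */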
	ret = fi_recv(ep->dg_ep, rxd_pkt_start(pkt_entry),
		      rxd_ep_domain(ep)->max_mtu_sz,
		      rxd_mr_desc(pkt_entry->mr, ep),
		      FI_ADDR_UNSPEC, &pkt_entry->context);
	if (ret) {
		ofi_buf_free(pkt_entry);
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "failed to repost\n");
		return ret;
	}

	ep->posted_bufs++;
	slist_insert_tail(&pkt_entry->s_entry, &ep->rx_pkt_list);

	return 0;
}
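A minimal usage sketch (not part of the snippet above): the receive queue would typically be pre-filled by calling this helper in a loop until ep->rx_size buffers are posted or posting fails. Only rxd_ep_post_buf and the rx_size field come from the surrounding code; the helper name rxd_ep_fill_rx and the loop structure are illustrative assumptions.

/* Illustrative sketch: post ep->rx_size receive buffers up front.
 * rxd_ep_fill_rx is a hypothetical helper name; it stops at the first failure. */
static int rxd_ep_fill_rx(struct rxd_ep *ep)
{
	size_t i;
	int ret;

	for (i = 0; i < ep->rx_size; i++) {
		ret = rxd_ep_post_buf(ep);
		if (ret)
			return ret;
	}
	return 0;
}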
Example #5
0
static size_t rxd_avail_buf(struct rxd_ep *rxd_ep, struct rxd_base_hdr *hdr,
			    void *ptr)
{
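	/* Inline bytes still available: the domain's inline limit minus what
	 * has already been packed past the base header. */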
	return rxd_ep_domain(rxd_ep)->max_inline_msg -
		((uintptr_t) ptr - (uintptr_t) hdr);
}
Example #6
0
struct rxd_x_entry *rxd_tx_entry_init(struct rxd_ep *ep, const struct iovec *iov,
				      size_t iov_count, const struct iovec *res_iov,
				      size_t res_count, size_t rma_count,
				      uint64_t data, uint64_t tag, void *context,
				      fi_addr_t addr, uint32_t op, uint32_t flags)
{
	struct rxd_x_entry *tx_entry;
	struct rxd_domain *rxd_domain = rxd_ep_domain(ep);
	size_t max_inline;

	tx_entry = rxd_get_tx_entry(ep, op);
	if (!tx_entry) {
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not get tx entry\n");
		return NULL;
	}

	tx_entry->op = op;
	tx_entry->peer = addr;
	tx_entry->flags = flags;
	tx_entry->bytes_done = 0;
	tx_entry->offset = 0;
	tx_entry->next_seg_no = 0;
	tx_entry->iov_count = iov_count;
	memcpy(&tx_entry->iov[0], iov, sizeof(*iov) * iov_count);
	if (res_count) {
		tx_entry->res_count = res_count;
		memcpy(&tx_entry->res_iov[0], res_iov, sizeof(*res_iov) * res_count);
	}

	tx_entry->cq_entry.op_context = context;
	tx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count);
	tx_entry->cq_entry.buf = iov[0].iov_base;
	tx_entry->cq_entry.flags = ofi_tx_cq_flags(op);
	tx_entry->cq_entry.tag = tag;

	tx_entry->pkt = NULL;

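	/* Start from the domain's inline budget and subtract the space taken
	 * by optional RMA iovs, tag, and remote CQ data. */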
	max_inline = rxd_domain->max_inline_msg;
	if (tx_entry->cq_entry.flags & FI_RMA)
		max_inline -= sizeof(struct ofi_rma_iov) * rma_count;

	if (tx_entry->flags & RXD_TAG_HDR)
		max_inline -= sizeof(tx_entry->cq_entry.tag);
	if (tx_entry->flags & RXD_REMOTE_CQ_DATA) {
		max_inline -= sizeof(tx_entry->cq_entry.data);
		tx_entry->cq_entry.data = data;
	}

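	/* Reads, multi-iov RMA, and anything too large for the inline budget
	 * also need a SAR header; otherwise the entry can be sent inline. */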
	if (rma_count > 1 || tx_entry->cq_entry.flags & FI_READ ||
	    tx_entry->cq_entry.len > max_inline)
		max_inline -= sizeof(struct rxd_sar_hdr);
	else
		tx_entry->flags |= RXD_INLINE;

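	/* Atomics and inline-sized transfers use a single segment; reads split
	 * the full length, other ops split whatever exceeds the inline portion
	 * into additional segments. */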
	if (tx_entry->cq_entry.flags & FI_ATOMIC || tx_entry->cq_entry.len <= max_inline)
		tx_entry->num_segs = 1;
	else if (tx_entry->cq_entry.flags & FI_READ)
		tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len,
						  rxd_domain->max_seg_sz);
	else
		tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len - max_inline,
						  rxd_domain->max_seg_sz) + 1;

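	/* Remote-read style ops (read, fetch, compare) go on the peer's
	 * rma_rx_list when the peer is resolved and its window has room;
	 * everything else queues on the tx_list. */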
	if ((tx_entry->op == RXD_READ_REQ || tx_entry->op == RXD_ATOMIC_FETCH ||
	     tx_entry->op == RXD_ATOMIC_COMPARE) &&
	    ep->peers[tx_entry->peer].unacked_cnt < ep->peers[tx_entry->peer].tx_window &&
	    ep->peers[tx_entry->peer].peer_addr != FI_ADDR_UNSPEC)
		dlist_insert_tail(&tx_entry->entry,
				  &ep->peers[tx_entry->peer].rma_rx_list);
	else
		dlist_insert_tail(&tx_entry->entry,
				  &ep->peers[tx_entry->peer].tx_list);

	return tx_entry;
}
Example #7
0
int rxd_ep_init_res(struct rxd_ep *ep, struct fi_info *fi_info)
{
	struct rxd_domain *rxd_domain = rxd_ep_domain(ep);
	struct ofi_bufpool_attr entry_pool_attr = {
		.size		= sizeof(struct rxd_x_entry),
		.alignment	= RXD_BUF_POOL_ALIGNMENT,
		.max_cnt	= 0,
		.flags		= OFI_BUFPOOL_INDEXED,
	};
	int ret;

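	/* Packet pools hold an MTU-sized packet plus its rxd_pkt_entry header;
	 * the region alloc/free hooks are only used when the core provider
	 * requires local memory registration. */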
	ret = ofi_bufpool_create_ex(&ep->tx_pkt_pool,
			rxd_domain->max_mtu_sz + sizeof(struct rxd_pkt_entry),
			RXD_BUF_POOL_ALIGNMENT, 0, RXD_TX_POOL_CHUNK_CNT,
			ep->do_local_mr ? rxd_buf_region_alloc_fn : NULL,
			ep->do_local_mr ? rxd_buf_region_free_fn : NULL,
			rxd_domain);
	if (ret)
		return ret;

	ret = ofi_bufpool_create_ex(&ep->rx_pkt_pool,
			rxd_domain->max_mtu_sz + sizeof(struct rxd_pkt_entry),
			RXD_BUF_POOL_ALIGNMENT, 0, RXD_RX_POOL_CHUNK_CNT,
			ep->do_local_mr ? rxd_buf_region_alloc_fn : NULL,
			ep->do_local_mr ? rxd_buf_region_free_fn : NULL,
			rxd_domain);
	if (ret)
		goto err;

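	/* The tx/rx entry pools reuse the same attributes, sized to the
	 * endpoint's queue depths. */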
	entry_pool_attr.flags |= OFI_BUFPOOL_NO_TRACK;
	entry_pool_attr.chunk_cnt = ep->tx_size;
	ret = ofi_bufpool_create_attr(&entry_pool_attr, &ep->tx_entry_pool);
	if (ret)
		goto err;

	entry_pool_attr.chunk_cnt = ep->rx_size;
	ret = ofi_bufpool_create_attr(&entry_pool_attr, &ep->rx_entry_pool);
	if (ret)
		goto err;

	dlist_init(&ep->rx_list);
	dlist_init(&ep->rx_tag_list);
	dlist_init(&ep->active_peers);
	dlist_init(&ep->rts_sent_list);
	dlist_init(&ep->unexp_list);
	dlist_init(&ep->unexp_tag_list);
	dlist_init(&ep->ctrl_pkts);
	slist_init(&ep->rx_pkt_list);

	return 0;
err:
	if (ep->tx_pkt_pool)
		ofi_bufpool_destroy(ep->tx_pkt_pool);

	if (ep->rx_pkt_pool)
		ofi_bufpool_destroy(ep->rx_pkt_pool);

	if (ep->tx_entry_pool)
		ofi_bufpool_destroy(ep->tx_entry_pool);

	if (ep->rx_entry_pool)
		ofi_bufpool_destroy(ep->rx_entry_pool);

	return ret;
}

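/* Reset a peer slot to its initial, unconnected state. */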
static void rxd_init_peer(struct rxd_ep *ep, uint64_t rxd_addr)
{
	ep->peers[rxd_addr].peer_addr = FI_ADDR_UNSPEC;
	ep->peers[rxd_addr].tx_seq_no = 0;
	ep->peers[rxd_addr].rx_seq_no = 0;
	ep->peers[rxd_addr].last_rx_ack = 0;
	ep->peers[rxd_addr].last_tx_ack = 0;
	ep->peers[rxd_addr].rx_window = rxd_env.max_unacked;
	ep->peers[rxd_addr].tx_window = rxd_env.max_unacked;
	ep->peers[rxd_addr].unacked_cnt = 0;
	ep->peers[rxd_addr].retry_cnt = 0;
	ep->peers[rxd_addr].active = 0;
	dlist_init(&ep->peers[rxd_addr].unacked);
	dlist_init(&ep->peers[rxd_addr].tx_list);
	dlist_init(&ep->peers[rxd_addr].rx_list);
	dlist_init(&ep->peers[rxd_addr].rma_rx_list);
	dlist_init(&ep->peers[rxd_addr].buf_pkts);
}
Example #8
0
int rxd_process_start_data(struct rxd_ep *ep, struct rxd_rx_entry *rx_entry,
			   struct rxd_peer *peer, struct ofi_ctrl_hdr *ctrl,
			   struct fi_cq_msg_entry *comp,
			   struct rxd_rx_buf *rx_buf)
{
	uint64_t idx;
	int i, offset, ret;
	struct ofi_rma_iov *rma_iov;
	struct rxd_pkt_data_start *pkt_start;
	struct rxd_tx_entry *tx_entry;

	pkt_start = (struct rxd_pkt_data_start *) ctrl;

	switch (rx_entry->op_hdr.op) {
	case ofi_op_msg:
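		/* Try to match a posted receive; if none is available, queue the
		 * message as unexpected, up to RXD_EP_MAX_UNEXP_MSG. */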
		rx_entry->recv = rxd_get_recv_entry(ep, rx_entry);
		if (!rx_entry->recv) {
			if (ep->num_unexp_msg < RXD_EP_MAX_UNEXP_MSG) {
				dlist_insert_tail(&rx_entry->unexp_entry, &ep->unexp_msg_list);
				rx_entry->unexp_buf = rx_buf;
				ep->num_unexp_msg++;
				return -FI_ENOENT;
			} else {
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "dropping msg\n");
				return -FI_ENOMEM;
			}
		}

		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->recv->iov,
				     rx_entry->recv->msg.iov_count, ctrl,
				     pkt_start->data, rx_buf);
		break;
	case ofi_op_tagged:
		rx_entry->trecv = rxd_get_trecv_entry(ep, rx_entry);
		if (!rx_entry->trecv) {
			if (ep->num_unexp_msg < RXD_EP_MAX_UNEXP_MSG) {
				dlist_insert_tail(&rx_entry->unexp_entry, &ep->unexp_tag_list);
				rx_entry->unexp_buf = rx_buf;
				ep->num_unexp_msg++;
				return -FI_ENOENT;
			} else {
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "dropping msg\n");
				return -FI_ENOMEM;
			}
		}

		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->trecv->iov,
				     rx_entry->trecv->msg.iov_count, ctrl,
				     pkt_start->data, rx_buf);
		break;
	case ofi_op_write:
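		/* Verify each target iov against registered memory before
		 * accepting the remote write. */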
		rma_iov = (struct ofi_rma_iov *) pkt_start->data;
		for (i = 0; i < rx_entry->op_hdr.iov_count; i++) {
			ret = rxd_mr_verify(rxd_ep_domain(ep),
					    rma_iov[i].len,
					    (uintptr_t *) &rma_iov[i].addr,
					    rma_iov[i].key, FI_REMOTE_WRITE);
			if (ret) {
				/* todo: handle invalid key case */
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "invalid key/access permissions\n");
				return -FI_EACCES;
			}

			rx_entry->write.iov[i].iov_base = (void *) (uintptr_t) rma_iov[i].addr;
			rx_entry->write.iov[i].iov_len = rma_iov[i].len;
		}

		offset = sizeof(struct ofi_rma_iov) * rx_entry->op_hdr.iov_count;
		ctrl->seg_size -= offset;
		rxd_ep_handle_data_msg(ep, peer, rx_entry, rx_entry->write.iov,
				       rx_entry->op_hdr.iov_count, ctrl,
				       pkt_start->data + offset, rx_buf);
		break;
	case ofi_op_read_req:
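		/* Turn the read request into a read-response tx entry that sources
		 * data from the verified local regions. */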
		rma_iov = (struct ofi_rma_iov *) pkt_start->data;
		tx_entry = rxd_tx_entry_alloc(ep, peer, rx_entry->peer, 0,
						RXD_TX_READ_RSP);
		if (!tx_entry) {
			FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "no free tx-entry\n");
			return -FI_ENOMEM;
		}

		tx_entry->peer = rx_entry->peer;
		tx_entry->read_rsp.iov_count = rx_entry->op_hdr.iov_count;
		for (i = 0; i < rx_entry->op_hdr.iov_count; i++) {
			ret = rxd_mr_verify(rxd_ep_domain(ep),
					    rma_iov[i].len,
					    (uintptr_t *) &rma_iov[i].addr,
					    rma_iov[i].key, FI_REMOTE_READ);
			if (ret) {
				/* todo: handle invalid key case */
				FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "invalid key/access permissions\n");
				return -FI_EACCES;
			}

			tx_entry->read_rsp.src_iov[i].iov_base = (void *) (uintptr_t)
								rma_iov[i].addr;
			tx_entry->read_rsp.src_iov[i].iov_len = rma_iov[i].len;
		}
		tx_entry->read_rsp.peer_msg_id = ctrl->msg_id;
		ret = rxd_ep_start_xfer(ep, peer, ofi_op_read_rsp, tx_entry);
		if (ret)
			rxd_tx_entry_free(ep, tx_entry);
		rxd_rx_entry_free(ep, rx_entry);
		break;
	case ofi_op_read_rsp:
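		/* Find the read request this response answers and deliver the data
		 * into its destination iovs. */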
		idx = rx_entry->op_hdr.remote_idx & RXD_TX_IDX_BITS;
		tx_entry = &ep->tx_entry_fs->buf[idx];
		if (tx_entry->msg_id != rx_entry->op_hdr.remote_idx)
			return -FI_ENOMEM;

		rx_entry->read_rsp.tx_entry = tx_entry;
		rxd_ep_handle_data_msg(ep, peer, rx_entry, tx_entry->read_req.dst_iov,
				       tx_entry->read_req.msg.iov_count, ctrl,
				       pkt_start->data, rx_buf);
		break;
	case ofi_op_atomic:
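		/* Atomics are not handled by this path and fall through to the
		 * error case. */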
	default:
		FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "invalid op type\n");
		return -FI_EINVAL;
	}
	return 0;
}