Example #1
/* Must call with cq->lock held */
ssize_t fi_ibv_poll_cq(struct fi_ibv_cq *cq, struct ibv_wc *wc)
{
	struct fi_ibv_msg_epe *epe;
	struct slist_entry *entry;
	ssize_t ret;

	ret = ibv_poll_cq(cq->cq, 1, wc);
	if (ret <= 0)
		return ret;

	if (wc->opcode == IBV_WC_RECV || wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM)
		return ret;

	/* TODO Handle the case when app posts a send with same wr_id */
	if ((wc->wr_id & cq->wr_id_mask) != cq->send_signal_wr_id)
		return ret;

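	/*
	 * A signaled send completed: look up the owning EP by the EP id
	 * embedded in wr_id and update its unsignaled-send accounting.
	 */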
	entry = slist_remove_first_match(&cq->ep_list, fi_ibv_match_ep_id, (void *)wc->wr_id);
	if (!entry) {
		FI_WARN(&fi_ibv_prov, FI_LOG_CQ,
			"No matching EP for given signaled send completion\n");
		return -FI_EOTHER;
	}
	epe = container_of(entry, struct fi_ibv_msg_epe, entry);
	atomic_sub(&epe->ep->unsignaled_send_cnt,
			VERBS_SEND_SIGNAL_THRESH(epe->ep));
	atomic_dec(&epe->ep->comp_pending);
	util_buf_release(cq->domain->fab->epe_pool, epe);

	return 0;
}
Example #2
static inline void rxd_release_unexp_entry(struct rxd_cq *cq,
					    struct fi_cq_msg_entry *comp)
{
	struct rxd_unexp_cq_entry *unexp;
	unexp = container_of(comp, struct rxd_unexp_cq_entry, cq_entry);
	dlist_remove(&unexp->entry);
	util_buf_release(cq->unexp_pool, unexp);
}
Example #3
static ssize_t
mrail_tsend_common(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
		   size_t count, size_t len, fi_addr_t dest_addr, uint64_t tag,
		   uint64_t data, void *context, uint64_t flags)
{
	struct mrail_ep *mrail_ep = container_of(ep_fid, struct mrail_ep,
						 util_ep.ep_fid.fid);
	struct mrail_peer_info *peer_info;
	struct iovec *iov_dest = alloca(sizeof(*iov_dest) * (count + 1));
	struct mrail_tx_buf *tx_buf;
	uint32_t i = mrail_get_tx_rail(mrail_ep);
	struct fi_msg msg;
	ssize_t ret;

	peer_info = ofi_av_get_addr(mrail_ep->util_ep.av, (int) dest_addr);

	ofi_ep_lock_acquire(&mrail_ep->util_ep);

	tx_buf = mrail_get_tx_buf(mrail_ep, context, peer_info->seq_no++,
				  ofi_op_tagged, flags | FI_TAGGED);
	if (OFI_UNLIKELY(!tx_buf)) {
		ret = -FI_ENOMEM;
		goto err1;
	}
	tx_buf->hdr.tag = tag;
	mrail_copy_iov_hdr(&tx_buf->hdr, iov_dest, iov, count);

	msg.msg_iov 	= iov_dest;
	msg.desc    	= desc;
	msg.iov_count	= count + 1;
	msg.addr	= dest_addr;
	msg.context	= tx_buf;
	msg.data	= data;

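	/* If the payload fits within the rail's inject limit, request
	 * FI_INJECT so the provider consumes the data on return and the
	 * stack-allocated iov_dest can be reused immediately. */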
	if (len < mrail_ep->rails[i].info->tx_attr->inject_size)
		flags |= FI_INJECT;

	FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting tsend of length: %" PRIu64
	       " dest_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 " seq: %d"
	       " on rail: %d\n", len, dest_addr, tag, peer_info->seq_no - 1, i);

	ret = fi_sendmsg(mrail_ep->rails[i].ep, &msg, flags);
	if (ret) {
		FI_WARN(&mrail_prov, FI_LOG_EP_DATA,
			"Unable to fi_sendmsg on rail: %" PRIu32 "\n", i);
		goto err2;
	} else if (!(flags & FI_COMPLETION)) {
		ofi_ep_tx_cntr_inc(&mrail_ep->util_ep);
	}
	ofi_ep_lock_release(&mrail_ep->util_ep);
	return ret;
err2:
	util_buf_release(mrail_ep->tx_buf_pool, tx_buf);
err1:
	peer_info->seq_no--;
	ofi_ep_lock_release(&mrail_ep->util_ep);
	return ret;
}
Example #4
static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *entry;
	struct ibv_wc wc;
	ssize_t ret = 0, i;

	cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid);

	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);

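	/* Drain completions already queued on the software wcq (e.g. saved
	 * error entries or reaped signaled sends) before polling hardware. */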
	for (i = 0; i < count; i++) {
		if (!slist_empty(&cq->wcq)) {
			wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
			if (wce->wc.status) {
				ret = -FI_EAVAIL;
				break;
			}
			entry = slist_remove_head(&cq->wcq);
			wce = container_of(entry, struct fi_ibv_wce, entry);
			cq->read_entry(&wce->wc, (char *)buf + i * cq->entry_size);
			util_buf_release(cq->wce_pool, wce);
			continue;
		}

		ret = fi_ibv_poll_cq(cq, &wc);
		if (ret <= 0)
			break;

		/* Insert error entry into wcq */
		if (OFI_UNLIKELY(wc.status)) {
			if (wc.status == IBV_WC_WR_FLUSH_ERR) {
				/* Handle case when remote side destroys
				 * the connection, but local side isn't aware
				 * about that yet */
				VERBS_DBG(FI_LOG_CQ,
					  "Ignoring WC with status "
					  "IBV_WC_WR_FLUSH_ERR(%d)\n",
					  wc.status);
				i--;
				continue;
			}
			wce = util_buf_alloc(cq->wce_pool);
			if (!wce) {
				cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
			memcpy(&wce->wc, &wc, sizeof wc);
			slist_insert_tail(&wce->entry, &cq->wcq);
			ret = -FI_EAVAIL;
			break;
		}

		cq->read_entry(&wc, (char *)buf + i * cq->entry_size);
	}

	/* Assumed tail (snippet truncated here): drop the CQ lock and
	 * report entries read, else the poll status or -FI_EAGAIN. */
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);

	return i ? i : (ret ? ret : -FI_EAGAIN);
}
Example #5
void rxd_tx_pkt_release(struct rxd_pkt_meta *pkt_meta)
{
	if (RXD_PKT_IS_COMPLETE(pkt_meta)) {
		FI_DBG(&rxd_prov, FI_LOG_EP_CTRL, "Releasing buf: %p, num_out: %d\n",
		       pkt_meta, pkt_meta->ep->num_out);
		pkt_meta->ep->num_out--;
		util_buf_release(pkt_meta->ep->tx_pkt_pool, pkt_meta);
	}
}
Example #6
static int
util_mr_cache_create(struct ofi_mr_cache *cache, const struct iovec *iov,
		     uint64_t access, struct ofi_mr_entry **entry)
{
	int ret;

	FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %" PRIu64 ")\n",
	       iov->iov_base, iov->iov_len);

	util_mr_cache_process_events(cache);

	*entry = util_buf_alloc(cache->entry_pool);
	if (OFI_UNLIKELY(!*entry))
		return -FI_ENOMEM;

	(*entry)->iov = *iov;
	(*entry)->use_cnt = 1;

	ret = cache->add_region(cache, *entry);
	if (ret) {
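		/* add_region failed, most likely out of MR resources: flush
		 * unused cached entries to reclaim resources and retry. */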
		while (ret && ofi_mr_cache_flush(cache)) {
			ret = cache->add_region(cache, *entry);
		}
		if (ret) {
			assert(!ofi_mr_cache_flush(cache));
			util_buf_release(cache->entry_pool, *entry);
			return ret;
		}
	}

	cache->cached_size += iov->iov_len;
	if ((++cache->cached_cnt > cache->max_cached_cnt) ||
	    (cache->cached_size > cache->max_cached_size)) {
		(*entry)->cached = 0;
	} else {
		if (cache->mr_storage.insert(&cache->mr_storage,
					     &(*entry)->iov, *entry)) {
			ret = -FI_ENOMEM;
			goto err;
		}
		(*entry)->cached = 1;

		ret = ofi_monitor_subscribe(&cache->nq, iov->iov_base, iov->iov_len,
					    &(*entry)->subscription);
		if (ret)
			goto err;
		(*entry)->subscribed = 1;
	}

	return 0;

err:
	util_mr_free_entry(cache, *entry);
	return ret;
}
Example #7
void pe_entry_release(struct tcpx_pe_entry *pe_entry)
{
	struct tcpx_domain *domain;

	domain = container_of(pe_entry->ep->util_ep.domain,
			      struct tcpx_domain, util_domain);

	memset(&pe_entry->msg_hdr, 0, sizeof(pe_entry->msg_hdr));
	dlist_remove(&pe_entry->entry);
	memset(pe_entry, 0, sizeof(*pe_entry));
	util_buf_release(domain->progress.pe_entry_pool, pe_entry);
}
Example #8
static int fi_ibv_reap_comp(struct fi_ibv_msg_ep *ep)
{
	struct fi_ibv_wce *wce = NULL;
	int got_wc = 0;
	int ret = 0;

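	/* Reap every outstanding signaled-send completion, parking the
	 * successful ones on the CQ's software queue so a later CQ read
	 * can still return them. */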
	fastlock_acquire(&ep->scq->lock);
	while (ofi_atomic_get32(&ep->comp_pending) > 0) {
		if (!wce) {
			wce = util_buf_alloc(ep->scq->wce_pool);
			if (!wce) {
				fastlock_release(&ep->scq->lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
		}
		ret = fi_ibv_poll_cq(ep->scq, &wce->wc);
		if (ret < 0) {
			VERBS_WARN(FI_LOG_EP_DATA,
				   "Failed to read completion for signaled send\n");
			util_buf_release(ep->scq->wce_pool, wce);
			fastlock_release(&ep->scq->lock);
			return ret;
		} else if (ret > 0) {
			slist_insert_tail(&wce->entry, &ep->scq->wcq);
			got_wc = 1;
			wce = NULL;
		}
	}
	if (wce)
		util_buf_release(ep->scq->wce_pool, wce);

	if (got_wc && ep->scq->channel)
		ret = fi_ibv_cq_signal(&ep->scq->cq_fid);

	fastlock_release(&ep->scq->lock);
	return ret;
}
Example #9
static ssize_t
fi_ibv_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry,
		  uint64_t flags)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *slist_entry;
	uint32_t api_version;

	cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid);

	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
	if (slist_empty(&cq->wcq))
		goto err;

	wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
	if (!wce->wc.status)
		goto err;

	api_version = cq->util_cq.domain->fabric->fabric_fid.api_version;

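	/* Detach the error entry while holding the lock; it can then be
	 * unpacked after the lock is dropped. */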
	slist_entry = slist_remove_head(&cq->wcq);
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);

	wce = container_of(slist_entry, struct fi_ibv_wce, entry);

	entry->op_context = (void *)(uintptr_t)wce->wc.wr_id;
	entry->err = EIO;
	entry->prov_errno = wce->wc.status;
	fi_ibv_handle_wc(&wce->wc, &entry->flags, &entry->len, &entry->data);

	if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) &&
		entry->err_data && entry->err_data_size) {
		entry->err_data_size = MIN(entry->err_data_size,
			sizeof(wce->wc.vendor_err));
		memcpy(entry->err_data, &wce->wc.vendor_err, entry->err_data_size);
	} else {
		memcpy(&entry->err_data, &wce->wc.vendor_err,
			sizeof(wce->wc.vendor_err));
	}

	util_buf_release(cq->wce_pool, wce);
	return 1;
err:
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
	return -FI_EAGAIN;
}
Example #10
static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *entry;
	struct ibv_wc wc;
	ssize_t ret = 0, i;

	cq = container_of(cq_fid, struct fi_ibv_cq, cq_fid);

	fastlock_acquire(&cq->lock);

	for (i = 0; i < count; i++) {
		if (!slist_empty(&cq->wcq)) {
			wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
			if (wce->wc.status) {
				ret = -FI_EAVAIL;
				break;
			}
			entry = slist_remove_head(&cq->wcq);
			wce = container_of(entry, struct fi_ibv_wce, entry);
			cq->read_entry(&wce->wc, i, buf);
			util_buf_release(cq->domain->fab->wce_pool, wce);
			continue;
		}

		ret = fi_ibv_poll_cq(cq, &wc);
		if (ret <= 0)
			break;

		/* Insert error entry into wcq */
		if (wc.status) {
			wce = util_buf_alloc(cq->domain->fab->wce_pool);
			if (!wce) {
				fastlock_release(&cq->lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
			memcpy(&wce->wc, &wc, sizeof wc);
			slist_insert_tail(&wce->entry, &cq->wcq);
			ret = -FI_EAVAIL;
			break;
		}

		cq->read_entry(&wc, i, buf);
	}

	/* Assumed tail (snippet truncated here), mirroring the unlock in
	 * the error path above. */
	fastlock_release(&cq->lock);

	return i ? i : (ret ? ret : -FI_EAGAIN);
}
Example #11
static void util_mr_free_entry(struct ofi_mr_cache *cache,
			       struct ofi_mr_entry *entry)
{
	FI_DBG(cache->domain->prov, FI_LOG_MR, "free %p (len: %" PRIu64 ")\n",
	       entry->iov.iov_base, entry->iov.iov_len);

	assert(!entry->cached);
	if (entry->subscribed) {
		ofi_monitor_unsubscribe(&entry->subscription);
		entry->subscribed = 0;
	}
	cache->delete_region(cache, entry);
	assert((cache->cached_cnt != 0) &&
	       (((ssize_t)cache->cached_size - (ssize_t)entry->iov.iov_len) >= 0));
	cache->cached_cnt--;
	cache->cached_size -= entry->iov.iov_len;

	util_buf_release(cache->entry_pool, entry);
}
Example #12
int rxm_ep_prepost_buf(struct rxm_ep *rxm_ep)
{
	struct rxm_rx_buf *rx_buf;
	int ret, i;

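	/* Pre-post one chunk's worth of receive buffers on the underlying
	 * MSG endpoint. */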
	for (i = 0; i < rxm_ep->rx_pool->chunk_cnt; i++) {
		rx_buf = util_buf_get(rxm_ep->rx_pool);
		rx_buf->ctx_type = RXM_RX_BUF;
		rx_buf->ep = rxm_ep;

		ret = rxm_ep_repost_buf(rx_buf);
		if (ret) {
			util_buf_release(rxm_ep->rx_pool, rx_buf);
			return ret;
		}
		slist_insert_tail(&rx_buf->entry, &rxm_ep->rx_buf_list);
	}
	return 0;
}
Example #13
static void rxm_ep_txrx_res_close(struct rxm_ep *rxm_ep)
{
	struct slist_entry *entry;
	struct rxm_rx_buf *rx_buf;

	rxm_recv_queue_close(&rxm_ep->trecv_queue);
	rxm_recv_queue_close(&rxm_ep->recv_queue);

	if (rxm_ep->txe_fs)
		rxm_txe_fs_free(rxm_ep->txe_fs);

	while (!slist_empty(&rxm_ep->rx_buf_list)) {
		entry = slist_remove_head(&rxm_ep->rx_buf_list);
		rx_buf = container_of(entry, struct rxm_rx_buf, entry);
		util_buf_release(rxm_ep->rx_pool, rx_buf);
	}

	util_buf_pool_destroy(rxm_ep->rx_pool);
	util_buf_pool_destroy(rxm_ep->tx_pool);
}
Example #14
static int process_srx_entry(struct tcpx_xfer_entry *rx_entry)
{
	int ret;

	ret = tcpx_recv_msg_data(rx_entry);
	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
		return ret;

	if (ret) {
		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
			"msg recv failed, ret = %d\n", ret);

		tcpx_ep_shutdown_report(rx_entry->ep,
					&rx_entry->ep->util_ep.ep_fid.fid);
	}

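	/* A delivery-complete message must be acknowledged to the sender
	 * before its completion is reported and the entry released. */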
	if ((ntohl(rx_entry->msg_hdr.hdr.flags) &
	     OFI_DELIVERY_COMPLETE) && !ret) {
		if (tcpx_prepare_rx_entry_resp(rx_entry))
			rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_entry_resp;

		return FI_SUCCESS;
	}

	tcpx_cq_report_completion(rx_entry->ep->util_ep.rx_cq,
				  rx_entry, -ret);

	/* release the shared entry */
	if (rx_entry->ep->cur_rx_entry == rx_entry) {
		rx_entry->ep->cur_rx_entry = NULL;
	}

	fastlock_acquire(&rx_entry->ep->srx_ctx->lock);
	util_buf_release(rx_entry->ep->srx_ctx->buf_pool, rx_entry);
	fastlock_release(&rx_entry->ep->srx_ctx->lock);
	return FI_SUCCESS;
}
Example #15
static ssize_t
fi_ibv_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry,
		  uint64_t flags)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *slist_entry;

	cq = container_of(cq_fid, struct fi_ibv_cq, cq_fid);

	fastlock_acquire(&cq->lock);
	if (slist_empty(&cq->wcq))
		goto err;

	wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
	if (!wce->wc.status)
		goto err;

	slist_entry = slist_remove_head(&cq->wcq);
	fastlock_release(&cq->lock);

	wce = container_of(slist_entry, struct fi_ibv_wce, entry);

	entry->op_context = (void *) (uintptr_t) wce->wc.wr_id;
	entry->flags = 0;
	entry->err = EIO;
	entry->prov_errno = wce->wc.status;
	memcpy(&entry->err_data, &wce->wc.vendor_err,
	       sizeof(wce->wc.vendor_err));

	util_buf_release(cq->domain->fab->wce_pool, wce);
	return sizeof(*entry);
err:
	fastlock_release(&cq->lock);
	return -FI_EAGAIN;
}
Example #16
void rxd_tx_pkt_free(struct rxd_pkt_meta *pkt_meta)
{
	util_buf_release(pkt_meta->ep->tx_pkt_pool, pkt_meta);
}
Example #17
static ssize_t
fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
		uint64_t flags)
{
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr);

	struct fi_ibv_rdm_rma_start_data start_data = {
		.ep_rdm = ep,
		.conn = conn,
		.context = msg->context,
		.flags = FI_RMA | FI_READ | (ep->tx_selective_completion ?
			(flags & FI_COMPLETION) : FI_COMPLETION),
		.data_len = (uint64_t)msg->msg_iov[0].iov_len,
		.rbuf = (uintptr_t)msg->rma_iov[0].addr,
		.lbuf = (uintptr_t)msg->msg_iov[0].iov_base,
		.rkey = (uint64_t)(uintptr_t)(msg->rma_iov[0].key),
		.lkey = (uint64_t)(uintptr_t)(msg->desc ? msg->desc[0] : NULL),
		.op_code = IBV_WR_RDMA_READ
	};

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	struct fi_ibv_rdm_buf *rdm_buf = NULL;
	ssize_t ret = FI_SUCCESS;

	if (msg->iov_count != 1 || msg->rma_iov_count != 1) {
		assert(0);
		return -FI_EMSGSIZE;
	}

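	/* Resolve the local key for the transfer; with no descriptor given
	 * this is expected to fall back to an internal pre-registered
	 * bounce buffer (returned via rdm_buf). */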
	ret = fi_ibv_rdm_ep_rma_preinit((void**)&start_data.lkey, &rdm_buf,
					msg->msg_iov[0].iov_len,
					conn, ep);
	if (ret) {
		return ret;
	}

	struct fi_ibv_rdm_request *request =
		util_buf_alloc(fi_ibv_rdm_request_pool);
	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err   = FI_SUCCESS;

	request->minfo.is_tagged = 0;
	request->rmabuf = rdm_buf;

	fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY,
				   &post_ready_data);
}

static ssize_t
fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
		size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
		void *context)
{
	struct fi_ibv_rdm_ep *ep_rdm = 
		container_of(ep, struct fi_ibv_rdm_ep, ep_fid);

	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = src_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	size_t i;
	for (i = 0; i < count; i++) {
		rma_iov.len += iov[i].iov_len;
	}

	return fi_ibv_rdm_ep_rma_readmsg(ep, &msg,
		(ep_rdm->tx_selective_completion ? 0ULL : FI_COMPLETION));
}

static ssize_t
fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len,
		    void *desc, fi_addr_t src_addr,
		    uint64_t addr, uint64_t key, void *context)
{
	const struct iovec iov = {
		.iov_base = buf,
		.iov_len = len
	};

	return fi_ibv_rdm_ep_rma_readv(ep_fid, &iov, &desc, 1, src_addr, addr,
					key, context);
}

static ssize_t
fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
		uint64_t flags)
{
	struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep,
						ep_fid);
	struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr);
	struct fi_ibv_rdm_request *request = NULL;
	struct fi_ibv_rdm_buf *rdm_buf = NULL;
	ssize_t ret = FI_SUCCESS;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep,
		.context = msg->context,
		.flags = FI_RMA | FI_WRITE | (ep->tx_selective_completion ?
			(flags & FI_COMPLETION) : FI_COMPLETION),
		.data_len = (uint64_t)msg->msg_iov[0].iov_len,
		.rbuf = msg->rma_iov[0].addr,
		.lbuf = (uintptr_t)msg->msg_iov[0].iov_base,
		.rkey = msg->rma_iov[0].key,
		.lkey = (uint64_t)(uintptr_t)(msg->desc ? msg->desc[0] : NULL),
		.op_code = IBV_WR_RDMA_WRITE
	};

	if (msg->iov_count != 1 || msg->rma_iov_count != 1) {
		assert(0);
		return -FI_EMSGSIZE;
	}

	ret = fi_ibv_rdm_ep_rma_preinit((void**)&start_data.lkey, &rdm_buf,
					msg->msg_iov[0].iov_len,
					conn, ep);
	if (ret) {
		return ret;
	}

	request = util_buf_alloc(fi_ibv_rdm_request_pool);
	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err   = FI_SUCCESS;

	request->minfo.is_tagged = 0;
	request->rmabuf = rdm_buf;

	fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY,
				   &post_ready_data);
}

static ssize_t
fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc,
		size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
		void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = dest_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	size_t i;
	for (i = 0; i < count; i++) {
		rma_iov.len += iov[i].iov_len;
	}

	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	return fi_ibv_rdm_ep_rma_writemsg(ep_fid, &msg,
		(ep_rdm->tx_selective_completion ? 0ULL : FI_COMPLETION));
}

static ssize_t
fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
			void *desc, fi_addr_t dest_addr, uint64_t addr,
			uint64_t key, void *context)
{
	const struct iovec iov = {
		.iov_base = (void *)buf,
		.iov_len = len
	};

	return fi_ibv_rdm_ep_rma_writev(ep_fid, &iov, &desc, 1, dest_addr, addr,
					key, context);
}

static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep,
					      const void *buf, size_t len,
					      fi_addr_t dest_addr,
					      uint64_t addr, uint64_t key)
{
	struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep,
						    ep_fid);
	struct fi_ibv_rdm_conn *conn = ep_rdm->av->addr_to_conn(ep_rdm, dest_addr);
	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep_rdm,
		.flags = 0, /* inject does not generate completion */
		.data_len = (uint64_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint64_t)key,
		.lkey = 0
	};
	struct fi_ibv_rdm_request *request =
		util_buf_alloc(fi_ibv_rdm_request_pool);
	ssize_t ret;

	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT;
	request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err   = FI_SUCCESS;

	request->minfo.is_tagged = 0;
	ret = fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	switch (ret)
	{
	case FI_SUCCESS:
		return ret;
	case -FI_EAGAIN:
		break;
	default:
		ret = -errno;
		break;
	}

	FI_IBV_RDM_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG);
	util_buf_release(fi_ibv_rdm_request_pool, request);

	fi_ibv_rdm_tagged_poll(ep_rdm);

	return ret;
}

static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = {
	.size		= sizeof(struct fi_ops_rma),
	.read		= fi_ibv_rdm_ep_rma_read,
	.readv		= fi_ibv_rdm_ep_rma_readv,
	.readmsg	= fi_ibv_rdm_ep_rma_readmsg,
	.write		= fi_ibv_rdm_ep_rma_write,
	.writev		= fi_ibv_rdm_ep_rma_writev,
	.writemsg	= fi_ibv_rdm_ep_rma_writemsg,
	.inject		= fi_ibv_rdm_ep_rma_inject_write,
	.writedata	= fi_no_rma_writedata,
	.injectdata	= fi_no_rma_injectdata,
};

struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(void)
{
	return &fi_ibv_rdm_ep_rma_ops;
}
Example #18
static ssize_t
fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len,
		    void *desc, fi_addr_t src_addr,
		    uint64_t addr, uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep,
						ep_fid);

	if (desc == NULL && len >= ep->rndv_threshold) {
		goto out_errinput;
	}

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *) src_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);

			if (raw_buf) {
				desc = (void*)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}

		if (again) {
			goto out_again;
		}
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   RMA_RESOURCES_IS_BUSY(conn, ep)) {
		/*
		 * TODO: Should be postponed queue flow for RMA be implemented?
		 */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request = 
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid),
		.conn = (struct fi_ibv_rdm_tagged_conn *) src_addr,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_READ
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);

out:
	return ret;

out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;

out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
		uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_read(ep,
					      msg->msg_iov[0].iov_base,
					      msg->msg_iov[0].iov_len,
					      msg->desc[0],
					      msg->addr,
					      msg->rma_iov[0].addr,
					      msg->rma_iov[0].key,
					      msg->context);
	}

	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
		size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key,
		void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};

	size_t i;
	for (i = 0; i < count; i++) {
		rma_iov.len += iov[i].iov_len;
	}

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = src_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, 0);
}

static ssize_t
fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
		     void *desc, fi_addr_t dest_addr,
		     uint64_t addr, uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep,
						ep_fid);

	if (desc == NULL && len >= ep->rndv_threshold) {
		goto out_errinput;
	}

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *) dest_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);

			if (raw_buf) {
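				/* No local descriptor: stage the payload in
				 * the connection's pre-registered RMA buffer
				 * so the unregistered user buffer need not
				 * be pinned. */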
				memcpy (raw_buf, buf, len);
				desc = (void*)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}

		if (again) {
			goto out_again;
		}
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   SEND_RESOURCES_IS_BUSY(conn, ep)) {
		/*
		 * TODO: Should be postponed queue flow for RMA be implemented?
		 */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request = 
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_WRITE
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);
	ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER;
out:
	return ret;

out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;

out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
		uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_write(ep,
					       msg->msg_iov[0].iov_base,
					       msg->msg_iov[0].iov_len,
					       msg->desc[0],
					       msg->addr,
					       msg->rma_iov[0].addr,
					       msg->rma_iov[0].key,
					       msg->context);
	}

	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **desc,
		size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key,
		void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};

	size_t i;
	for (i = 0; i < count; i++) {
		rma_iov.len += iov[i].iov_len;
	}

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = dest_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_writemsg(ep, &msg, 0);
}

static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep,
					      const void *buf, size_t len,
					      fi_addr_t dest_addr,
					      uint64_t addr, uint64_t key)
{
	struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep,
						    ep_fid);
	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *) dest_addr;
	struct fi_ibv_rdm_tagged_request *request = NULL;
	int ret = FI_EP_RDM_HNDL_AGAIN;

	if (len >= ep_rdm->rndv_threshold) {
		return -FI_EMSGSIZE;
	}

	if (fi_ibv_rdm_check_connection(conn, ep_rdm) &&
	    !RMA_RESOURCES_IS_BUSY(conn, ep_rdm) &&
	    !conn->postponed_entry)
	{
		request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool);

		FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ",
			request, FI_LOG_DEBUG);

		/* Initial state */
		request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT;
		request->state.rndv  = FI_IBV_STATE_RNDV_NOT_USED;

		struct fi_ibv_rdm_rma_start_data start_data = {
			.conn = conn,
			.ep_rdm = ep_rdm,
			.data_len = (uint32_t)len,
			.rbuf = addr,
			.lbuf = (uintptr_t)buf,
			.rkey = (uint32_t)key,
			.lkey = 0

		};

		ret =  fi_ibv_rdm_tagged_req_hndl(request,
						  FI_IBV_EVENT_RMA_START,
						  &start_data);
	}

	switch (ret)
	{
	case FI_EP_RDM_HNDL_SUCCESS:
		return ret;
	case FI_EP_RDM_HNDL_AGAIN:
		ret = -FI_EAGAIN;
		break;
	default:
		ret = -errno;
		break;
	}

	if (request) {
		FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request,
					      FI_LOG_DEBUG);
		util_buf_release(fi_ibv_rdm_tagged_request_pool, request);
	}

	fi_ibv_rdm_tagged_poll(ep_rdm);

	return ret;
}

static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = {
	.size		= sizeof(struct fi_ops_rma),
	.read		= fi_ibv_rdm_ep_rma_read,
	.readv		= fi_ibv_rdm_ep_rma_readv,
	.readmsg	= fi_ibv_rdm_ep_rma_readmsg,
	.write		= fi_ibv_rdm_ep_rma_write,
	.writev		= fi_ibv_rdm_ep_rma_writev,
	.writemsg	= fi_ibv_rdm_ep_rma_writemsg,
	.inject		= fi_ibv_rdm_ep_rma_inject_write,
	.writedata	= fi_no_rma_writedata,
	.injectdata	= fi_no_rma_injectdata,
};

struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep)
{
	return &fi_ibv_rdm_ep_rma_ops;
}
Example #19
// TODO handle all flags
static ssize_t rxm_ep_send_common(struct fid_ep *ep_fid, const struct iovec *iov,
		void **desc, size_t count, fi_addr_t dest_addr, void *context,
		uint64_t data, uint64_t tag, uint64_t flags, int op)
{
	struct rxm_ep *rxm_ep;
	struct rxm_conn *rxm_conn;
	struct rxm_tx_entry *tx_entry;
	struct rxm_pkt *pkt;
	struct fid_mr *mr;
	void *desc_tx_buf = NULL;
	struct rxm_rma_iov *rma_iov;
	int pkt_size = 0;
	int i, ret;

	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);

	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
	if (ret)
		return ret;

	if (freestack_isempty(rxm_ep->txe_fs)) {
		FI_DBG(&rxm_prov, FI_LOG_CQ, "Exhausted tx_entry freestack\n");
		return -FI_ENOMEM;
	}

	tx_entry = freestack_pop(rxm_ep->txe_fs);

	tx_entry->ctx_type = RXM_TX_ENTRY;
	tx_entry->ep = rxm_ep;
	tx_entry->context = context;
	tx_entry->flags = flags;

	if (rxm_ep->msg_info->mode & FI_LOCAL_MR) {
		pkt = util_buf_get_ex(rxm_ep->tx_pool, (void **)&mr);
		desc_tx_buf = fi_mr_desc(mr);
	} else {
		pkt = util_buf_get(rxm_ep->tx_pool);
	}
	assert(pkt);

	tx_entry->pkt = pkt;

	rxm_pkt_init(pkt);
	pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
	pkt->hdr.op = op;
	pkt->hdr.size = ofi_get_iov_len(iov, count);
	rxm_op_hdr_process_flags(&pkt->hdr, flags, data);

	if (op == ofi_op_tagged)
		pkt->hdr.tag = tag;

	if (pkt->hdr.size > RXM_TX_DATA_SIZE) {
		if (flags & FI_INJECT) {
			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
					"inject size supported: %zu, msg size: %" PRIu64 "\n",
					rxm_tx_attr.inject_size,
					pkt->hdr.size);
			ret = -FI_EMSGSIZE;
			goto err;
		}
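		/* Rendezvous path: advertise the source buffers as RMA iovs
		 * so the peer can pull the data directly. */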
		tx_entry->msg_id = ofi_idx2key(&rxm_ep->tx_key_idx,
				rxm_txe_fs_index(rxm_ep->txe_fs, tx_entry));
		pkt->ctrl_hdr.msg_id = tx_entry->msg_id;
		pkt->ctrl_hdr.type = ofi_ctrl_large_data;
		rma_iov = (struct rxm_rma_iov *)pkt->data;
		rma_iov->count = count;
		for (i = 0; i < count; i++) {
			rma_iov->iov[i].addr = rxm_ep->msg_info->domain_attr->mr_mode == FI_MR_SCALABLE ?
				0 : (uintptr_t)iov[i].iov_base;
			rma_iov->iov[i].len = (uint64_t)iov[i].iov_len;
			rma_iov->iov[i].key = fi_mr_key(desc[i]);
		}
		pkt_size = sizeof(*pkt) + sizeof(*rma_iov) + sizeof(*rma_iov->iov) * count;
		FI_DBG(&rxm_prov, FI_LOG_CQ,
				"Sending large msg. msg_id: 0x%" PRIx64 "\n",
				tx_entry->msg_id);
		FI_DBG(&rxm_prov, FI_LOG_CQ, "tx_entry->state -> RXM_LMT_START\n");
		tx_entry->state = RXM_LMT_START;
	} else {
		pkt->ctrl_hdr.type = ofi_ctrl_data;
		ofi_copy_iov_buf(iov, count, pkt->data, pkt->hdr.size, 0,
				OFI_COPY_IOV_TO_BUF);
		pkt_size = sizeof(*pkt) + pkt->hdr.size;
	}

	ret = fi_send(rxm_conn->msg_ep, pkt, pkt_size, desc_tx_buf, 0, tx_entry);
	if (ret) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_send for MSG provider failed\n");
		goto err;
	}
	return 0;
err:
	util_buf_release(rxm_ep->tx_pool, pkt);
	freestack_push(rxm_ep->txe_fs, tx_entry);
	return ret;
}