/* Must call with cq->lock held */
ssize_t fi_ibv_poll_cq(struct fi_ibv_cq *cq, struct ibv_wc *wc)
{
	struct fi_ibv_msg_epe *epe;
	struct slist_entry *entry;
	ssize_t ret;

	ret = ibv_poll_cq(cq->cq, 1, wc);
	if (ret <= 0)
		return ret;

	if (wc->opcode == IBV_WC_RECV || wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM)
		return ret;

	/* TODO Handle the case when app posts a send with same wr_id */
	if ((wc->wr_id & cq->wr_id_mask) != cq->send_signal_wr_id)
		return ret;

	entry = slist_remove_first_match(&cq->ep_list, fi_ibv_match_ep_id,
					 (void *)wc->wr_id);
	if (!entry) {
		FI_WARN(&fi_ibv_prov, FI_LOG_CQ,
			"No matching EP for given signaled send completion\n");
		return -FI_EOTHER;
	}

	epe = container_of(entry, struct fi_ibv_msg_epe, entry);
	atomic_sub(&epe->ep->unsignaled_send_cnt,
		   VERBS_SEND_SIGNAL_THRESH(epe->ep));
	atomic_dec(&epe->ep->comp_pending);
	util_buf_release(cq->domain->fab->epe_pool, epe);

	return 0;
}
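/*
 * Minimal usage sketch (illustrative, not provider code): the lock
 * contract above means every caller brackets fi_ibv_poll_cq() with the
 * CQ lock, as fi_ibv_reap_comp() further below does.
 */
static ssize_t example_locked_poll(struct fi_ibv_cq *cq, struct ibv_wc *wc)
{
	ssize_t ret;

	fastlock_acquire(&cq->lock);	/* satisfy the "lock held" precondition */
	ret = fi_ibv_poll_cq(cq, wc);
	fastlock_release(&cq->lock);
	return ret;
}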
static inline void rxd_release_unexp_entry(struct rxd_cq *cq,
					   struct fi_cq_msg_entry *comp)
{
	struct rxd_unexp_cq_entry *unexp;

	unexp = container_of(comp, struct rxd_unexp_cq_entry, cq_entry);
	dlist_remove(&unexp->entry);
	util_buf_release(cq->unexp_pool, unexp);
}
static ssize_t mrail_tsend_common(struct fid_ep *ep_fid, const struct iovec *iov,
				  void **desc, size_t count, size_t len,
				  fi_addr_t dest_addr, uint64_t tag, uint64_t data,
				  void *context, uint64_t flags)
{
	struct mrail_ep *mrail_ep = container_of(ep_fid, struct mrail_ep,
						 util_ep.ep_fid.fid);
	struct mrail_peer_info *peer_info;
	struct iovec *iov_dest = alloca(sizeof(*iov_dest) * (count + 1));
	struct mrail_tx_buf *tx_buf;
	uint32_t i = mrail_get_tx_rail(mrail_ep);
	struct fi_msg msg;
	ssize_t ret;

	peer_info = ofi_av_get_addr(mrail_ep->util_ep.av, (int)dest_addr);

	ofi_ep_lock_acquire(&mrail_ep->util_ep);
	tx_buf = mrail_get_tx_buf(mrail_ep, context, peer_info->seq_no++,
				  ofi_op_tagged, flags | FI_TAGGED);
	if (OFI_UNLIKELY(!tx_buf)) {
		ret = -FI_ENOMEM;
		goto err1;
	}
	tx_buf->hdr.tag = tag;

	mrail_copy_iov_hdr(&tx_buf->hdr, iov_dest, iov, count);

	msg.msg_iov	= iov_dest;
	msg.desc	= desc;
	msg.iov_count	= count + 1;
	msg.addr	= dest_addr;
	msg.context	= tx_buf;
	msg.data	= data;

	if (len < mrail_ep->rails[i].info->tx_attr->inject_size)
		flags |= FI_INJECT;

	FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting tsend of length: %" PRIu64
	       " dest_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 " seq: %d"
	       " on rail: %d\n", len, dest_addr, tag, peer_info->seq_no - 1, i);

	ret = fi_sendmsg(mrail_ep->rails[i].ep, &msg, flags);
	if (ret) {
		FI_WARN(&mrail_prov, FI_LOG_EP_DATA,
			"Unable to fi_sendmsg on rail: %" PRIu32 "\n", i);
		goto err2;
	} else if (!(flags & FI_COMPLETION)) {
		ofi_ep_tx_cntr_inc(&mrail_ep->util_ep);
	}
	ofi_ep_lock_release(&mrail_ep->util_ep);
	return ret;
err2:
	util_buf_release(mrail_ep->tx_buf_pool, tx_buf);
err1:
	peer_info->seq_no--;
	ofi_ep_lock_release(&mrail_ep->util_ep);
	return ret;
}
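/*
 * Illustrative sketch (not provider code): a fi_tsend()-style entry point
 * would reduce to mrail_tsend_common() roughly as below. The FI_COMPLETION
 * flag here is an assumption; the provider derives the completion flags
 * from the endpoint's bound-CQ configuration.
 */
static ssize_t example_mrail_tsend(struct fid_ep *ep_fid, const void *buf,
				   size_t len, void *desc, fi_addr_t dest_addr,
				   uint64_t tag, void *context)
{
	struct iovec iov = {
		.iov_base = (void *)buf,
		.iov_len = len,
	};

	return mrail_tsend_common(ep_fid, &iov, &desc, 1, len, dest_addr,
				  tag, 0, context, FI_COMPLETION);
}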
static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *entry;
	struct ibv_wc wc;
	ssize_t ret = 0, i;

	cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid);

	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);

	for (i = 0; i < count; i++) {
		if (!slist_empty(&cq->wcq)) {
			wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
			if (wce->wc.status) {
				ret = -FI_EAVAIL;
				break;
			}
			entry = slist_remove_head(&cq->wcq);
			wce = container_of(entry, struct fi_ibv_wce, entry);
			cq->read_entry(&wce->wc, (char *)buf + i * cq->entry_size);
			util_buf_release(cq->wce_pool, wce);
			continue;
		}

		ret = fi_ibv_poll_cq(cq, &wc);
		if (ret <= 0)
			break;

		/* Insert error entry into wcq */
		if (OFI_UNLIKELY(wc.status)) {
			if (wc.status == IBV_WC_WR_FLUSH_ERR) {
				/* Handle case when remote side destroys
				 * the connection, but local side isn't aware
				 * about that yet */
				VERBS_DBG(FI_LOG_CQ,
					  "Ignoring WC with status "
					  "IBV_WC_WR_FLUSH_ERR(%d)\n",
					  wc.status);
				i--;
				continue;
			}
			wce = util_buf_alloc(cq->wce_pool);
			if (!wce) {
				cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
			memcpy(&wce->wc, &wc, sizeof wc);
			slist_insert_tail(&wce->entry, &cq->wcq);
			ret = -FI_EAVAIL;
			break;
		}

		cq->read_entry(&wc, (char *)buf + i * cq->entry_size);
	}

	/* Report entries read, else the poll status */
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
	return i ? i : (ret ? ret : -FI_EAGAIN);
}
void rxd_tx_pkt_release(struct rxd_pkt_meta *pkt_meta)
{
	if (RXD_PKT_IS_COMPLETE(pkt_meta)) {
		FI_DBG(&rxd_prov, FI_LOG_EP_CTRL,
		       "Releasing buf: %p, num_out: %d\n",
		       pkt_meta, pkt_meta->ep->num_out);
		pkt_meta->ep->num_out--;
		util_buf_release(pkt_meta->ep->tx_pkt_pool, pkt_meta);
	}
}
static int util_mr_cache_create(struct ofi_mr_cache *cache,
				const struct iovec *iov, uint64_t access,
				struct ofi_mr_entry **entry)
{
	int ret;

	FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %" PRIu64 ")\n",
	       iov->iov_base, iov->iov_len);

	util_mr_cache_process_events(cache);

	*entry = util_buf_alloc(cache->entry_pool);
	if (OFI_UNLIKELY(!*entry))
		return -FI_ENOMEM;

	(*entry)->iov = *iov;
	(*entry)->use_cnt = 1;

	ret = cache->add_region(cache, *entry);
	if (ret) {
		/* Flush cached entries to reclaim resources, then retry */
		while (ret && ofi_mr_cache_flush(cache))
			ret = cache->add_region(cache, *entry);

		if (ret) {
			assert(!ofi_mr_cache_flush(cache));
			util_buf_release(cache->entry_pool, *entry);
			return ret;
		}
	}

	cache->cached_size += iov->iov_len;
	if ((++cache->cached_cnt > cache->max_cached_cnt) ||
	    (cache->cached_size > cache->max_cached_size)) {
		(*entry)->cached = 0;
	} else {
		if (cache->mr_storage.insert(&cache->mr_storage,
					     &(*entry)->iov, *entry)) {
			ret = -FI_ENOMEM;
			goto err;
		}
		(*entry)->cached = 1;

		ret = ofi_monitor_subscribe(&cache->nq, iov->iov_base,
					    iov->iov_len,
					    &(*entry)->subscription);
		if (ret)
			goto err;
		(*entry)->subscribed = 1;
	}

	return 0;
err:
	util_mr_free_entry(cache, *entry);
	return ret;
}
void pe_entry_release(struct tcpx_pe_entry *pe_entry)
{
	struct tcpx_domain *domain;

	domain = container_of(pe_entry->ep->util_ep.domain,
			      struct tcpx_domain, util_domain);

	memset(&pe_entry->msg_hdr, 0, sizeof(pe_entry->msg_hdr));
	dlist_remove(&pe_entry->entry);
	memset(pe_entry, 0, sizeof(*pe_entry));
	util_buf_release(domain->progress.pe_entry_pool, pe_entry);
}
static int fi_ibv_reap_comp(struct fi_ibv_msg_ep *ep)
{
	struct fi_ibv_wce *wce = NULL;
	int got_wc = 0;
	int ret = 0;

	fastlock_acquire(&ep->scq->lock);
	while (ofi_atomic_get32(&ep->comp_pending) > 0) {
		if (!wce) {
			wce = util_buf_alloc(ep->scq->wce_pool);
			if (!wce) {
				fastlock_release(&ep->scq->lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
		}
		ret = fi_ibv_poll_cq(ep->scq, &wce->wc);
		if (ret < 0) {
			VERBS_WARN(FI_LOG_EP_DATA,
				   "Failed to read completion for signaled send\n");
			util_buf_release(ep->scq->wce_pool, wce);
			fastlock_release(&ep->scq->lock);
			return ret;
		} else if (ret > 0) {
			slist_insert_tail(&wce->entry, &ep->scq->wcq);
			got_wc = 1;
			wce = NULL;
		}
	}
	if (wce)
		util_buf_release(ep->scq->wce_pool, wce);

	if (got_wc && ep->scq->channel)
		ret = fi_ibv_cq_signal(&ep->scq->cq_fid);

	fastlock_release(&ep->scq->lock);
	return ret;
}
static ssize_t fi_ibv_cq_readerr(struct fid_cq *cq_fid,
				 struct fi_cq_err_entry *entry, uint64_t flags)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *slist_entry;
	uint32_t api_version;

	cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid);

	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);
	if (slist_empty(&cq->wcq))
		goto err;

	wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
	if (!wce->wc.status)
		goto err;

	api_version = cq->util_cq.domain->fabric->fabric_fid.api_version;

	slist_entry = slist_remove_head(&cq->wcq);
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);

	wce = container_of(slist_entry, struct fi_ibv_wce, entry);

	entry->op_context = (void *)(uintptr_t)wce->wc.wr_id;
	entry->err = EIO;
	entry->prov_errno = wce->wc.status;
	fi_ibv_handle_wc(&wce->wc, &entry->flags, &entry->len, &entry->data);

	if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5)))
	    && entry->err_data && entry->err_data_size) {
		entry->err_data_size = MIN(entry->err_data_size,
					   sizeof(wce->wc.vendor_err));
		memcpy(entry->err_data, &wce->wc.vendor_err,
		       entry->err_data_size);
	} else {
		memcpy(&entry->err_data, &wce->wc.vendor_err,
		       sizeof(wce->wc.vendor_err));
	}

	util_buf_release(cq->wce_pool, wce);
	return 1;
err:
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
	return -FI_EAGAIN;
}
static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *entry;
	struct ibv_wc wc;
	ssize_t ret = 0, i;

	cq = container_of(cq_fid, struct fi_ibv_cq, cq_fid);

	fastlock_acquire(&cq->lock);

	for (i = 0; i < count; i++) {
		if (!slist_empty(&cq->wcq)) {
			wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
			if (wce->wc.status) {
				ret = -FI_EAVAIL;
				break;
			}
			entry = slist_remove_head(&cq->wcq);
			wce = container_of(entry, struct fi_ibv_wce, entry);
			cq->read_entry(&wce->wc, i, buf);
			util_buf_release(cq->domain->fab->wce_pool, wce);
			continue;
		}

		ret = fi_ibv_poll_cq(cq, &wc);
		if (ret <= 0)
			break;

		/* Insert error entry into wcq */
		if (wc.status) {
			wce = util_buf_alloc(cq->domain->fab->wce_pool);
			if (!wce) {
				fastlock_release(&cq->lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
			memcpy(&wce->wc, &wc, sizeof wc);
			slist_insert_tail(&wce->entry, &cq->wcq);
			ret = -FI_EAVAIL;
			break;
		}

		cq->read_entry(&wc, i, buf);
	}

	/* Report entries read, else the poll status */
	fastlock_release(&cq->lock);
	return i ? i : (ret ? ret : -FI_EAGAIN);
}
static void util_mr_free_entry(struct ofi_mr_cache *cache,
			       struct ofi_mr_entry *entry)
{
	FI_DBG(cache->domain->prov, FI_LOG_MR, "free %p (len: %" PRIu64 ")\n",
	       entry->iov.iov_base, entry->iov.iov_len);

	assert(!entry->cached);
	if (entry->subscribed) {
		ofi_monitor_unsubscribe(&entry->subscription);
		entry->subscribed = 0;
	}
	cache->delete_region(cache, entry);

	assert((cache->cached_cnt != 0) &&
	       (((ssize_t)cache->cached_size -
		 (ssize_t)entry->iov.iov_len) >= 0));
	cache->cached_cnt--;
	cache->cached_size -= entry->iov.iov_len;

	util_buf_release(cache->entry_pool, entry);
}
int rxm_ep_prepost_buf(struct rxm_ep *rxm_ep)
{
	struct rxm_rx_buf *rx_buf;
	int ret, i;

	for (i = 0; i < rxm_ep->rx_pool->chunk_cnt; i++) {
		rx_buf = util_buf_get(rxm_ep->rx_pool);

		rx_buf->ctx_type = RXM_RX_BUF;
		rx_buf->ep = rxm_ep;

		ret = rxm_ep_repost_buf(rx_buf);
		if (ret) {
			util_buf_release(rxm_ep->rx_pool, rx_buf);
			return ret;
		}
		slist_insert_tail(&rx_buf->entry, &rxm_ep->rx_buf_list);
	}
	return 0;
}
static void rxm_ep_txrx_res_close(struct rxm_ep *rxm_ep)
{
	struct slist_entry *entry;
	struct rxm_rx_buf *rx_buf;

	rxm_recv_queue_close(&rxm_ep->trecv_queue);
	rxm_recv_queue_close(&rxm_ep->recv_queue);

	if (rxm_ep->txe_fs)
		rxm_txe_fs_free(rxm_ep->txe_fs);

	while (!slist_empty(&rxm_ep->rx_buf_list)) {
		entry = slist_remove_head(&rxm_ep->rx_buf_list);
		rx_buf = container_of(entry, struct rxm_rx_buf, entry);
		util_buf_release(rxm_ep->rx_pool, rx_buf);
	}
	util_buf_pool_destroy(rxm_ep->rx_pool);
	util_buf_pool_destroy(rxm_ep->tx_pool);
}
static int process_srx_entry(struct tcpx_xfer_entry *rx_entry)
{
	int ret;

	ret = tcpx_recv_msg_data(rx_entry);
	if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret))
		return ret;

	if (ret) {
		FI_WARN(&tcpx_prov, FI_LOG_DOMAIN,
			"msg recv failed, ret = %d\n", ret);
		tcpx_ep_shutdown_report(rx_entry->ep,
					&rx_entry->ep->util_ep.ep_fid.fid);
	}

	if ((ntohl(rx_entry->msg_hdr.hdr.flags) & OFI_DELIVERY_COMPLETE) &&
	    !ret) {
		if (tcpx_prepare_rx_entry_resp(rx_entry))
			rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_entry_resp;

		return FI_SUCCESS;
	}

	tcpx_cq_report_completion(rx_entry->ep->util_ep.rx_cq,
				  rx_entry, -ret);

	/* release the shared entry */
	if (rx_entry->ep->cur_rx_entry == rx_entry)
		rx_entry->ep->cur_rx_entry = NULL;

	fastlock_acquire(&rx_entry->ep->srx_ctx->lock);
	util_buf_release(rx_entry->ep->srx_ctx->buf_pool, rx_entry);
	fastlock_release(&rx_entry->ep->srx_ctx->lock);

	return FI_SUCCESS;
}
static ssize_t fi_ibv_cq_readerr(struct fid_cq *cq_fid,
				 struct fi_cq_err_entry *entry, uint64_t flags)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *slist_entry;

	cq = container_of(cq_fid, struct fi_ibv_cq, cq_fid);

	fastlock_acquire(&cq->lock);
	if (slist_empty(&cq->wcq))
		goto err;

	wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry);
	if (!wce->wc.status)
		goto err;

	slist_entry = slist_remove_head(&cq->wcq);
	fastlock_release(&cq->lock);

	wce = container_of(slist_entry, struct fi_ibv_wce, entry);

	entry->op_context = (void *)(uintptr_t)wce->wc.wr_id;
	entry->flags = 0;
	entry->err = EIO;
	entry->prov_errno = wce->wc.status;
	memcpy(&entry->err_data, &wce->wc.vendor_err,
	       sizeof(wce->wc.vendor_err));

	util_buf_release(cq->domain->fab->wce_pool, wce);
	return sizeof(*entry);
err:
	fastlock_release(&cq->lock);
	return -FI_EAGAIN;
}
void rxd_tx_pkt_free(struct rxd_pkt_meta *pkt_meta)
{
	util_buf_release(pkt_meta->ep->tx_pkt_pool, pkt_meta);
}
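/*
 * Hypothetical usage sketch (not provider code): rxd_tx_pkt_release()
 * above is the completion-path release -- it frees only once
 * RXD_PKT_IS_COMPLETE() reports the packet is no longer in flight --
 * whereas rxd_tx_pkt_free() returns the buffer unconditionally, e.g. on
 * a post failure where no completion will ever arrive.
 */
static void example_tx_pkt_cleanup(struct rxd_pkt_meta *pkt_meta, int post_ret)
{
	if (post_ret)
		rxd_tx_pkt_free(pkt_meta);	/* nothing in flight; free now */
	else
		rxd_tx_pkt_release(pkt_meta);	/* defers until fully complete */
}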
static ssize_t
fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
			  uint64_t flags)
{
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr);
	struct fi_ibv_rdm_rma_start_data start_data = {
		.ep_rdm = ep,
		.conn = conn,
		.context = msg->context,
		.flags = FI_RMA | FI_READ |
			 (ep->tx_selective_completion ?
			  (flags & FI_COMPLETION) : FI_COMPLETION),
		.data_len = (uint64_t)msg->msg_iov[0].iov_len,
		.rbuf = (uintptr_t)msg->rma_iov[0].addr,
		.lbuf = (uintptr_t)msg->msg_iov[0].iov_base,
		.rkey = (uint64_t)(uintptr_t)(msg->rma_iov[0].key),
		.lkey = (uint64_t)(uintptr_t)(msg->desc ? msg->desc[0] : NULL),
		.op_code = IBV_WR_RDMA_READ
	};
	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };
	struct fi_ibv_rdm_buf *rdm_buf = NULL;
	ssize_t ret = FI_SUCCESS;

	if (msg->iov_count != 1 || msg->rma_iov_count != 1) {
		assert(0);
		return -FI_EMSGSIZE;
	}

	ret = fi_ibv_rdm_ep_rma_preinit((void **)&start_data.lkey, &rdm_buf,
					msg->msg_iov[0].iov_len, conn, ep);
	if (ret)
		return ret;

	struct fi_ibv_rdm_request *request =
		util_buf_alloc(fi_ibv_rdm_request_pool);
	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err = FI_SUCCESS;

	request->minfo.is_tagged = 0;
	request->rmabuf = rdm_buf;

	fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY,
				   &post_ready_data);
}

static ssize_t
fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
			size_t count, fi_addr_t src_addr, uint64_t addr,
			uint64_t key, void *context)
{
	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};
	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = src_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	return fi_ibv_rdm_ep_rma_readmsg(ep, &msg,
					 (ep_rdm->tx_selective_completion ?
					  0ULL : FI_COMPLETION));
}

static ssize_t
fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len,
		       void *desc, fi_addr_t src_addr, uint64_t addr,
		       uint64_t key, void *context)
{
	const struct iovec iov = {
		.iov_base = buf,
		.iov_len = len
	};

	return fi_ibv_rdm_ep_rma_readv(ep_fid, &iov, &desc, 1, src_addr,
				       addr, key, context);
}

static ssize_t
fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg,
			   uint64_t flags)
{
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr);
	struct fi_ibv_rdm_request *request = NULL;
	struct fi_ibv_rdm_buf *rdm_buf = NULL;
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep,
		.context = msg->context,
		.flags = FI_RMA | FI_WRITE |
			 (ep->tx_selective_completion ?
			  (flags & FI_COMPLETION) : FI_COMPLETION),
		.data_len = (uint64_t)msg->msg_iov[0].iov_len,
		.rbuf = msg->rma_iov[0].addr,
		.lbuf = (uintptr_t)msg->msg_iov[0].iov_base,
		.rkey = msg->rma_iov[0].key,
		.lkey = (uint64_t)(uintptr_t)(msg->desc ? msg->desc[0] : NULL),
		.op_code = IBV_WR_RDMA_WRITE
	};

	/* Only single-iov transfers are supported; mirror the readmsg check */
	if (msg->iov_count != 1 || msg->rma_iov_count != 1) {
		assert(0);
		return -FI_EMSGSIZE;
	}

	ret = fi_ibv_rdm_ep_rma_preinit((void **)&start_data.lkey, &rdm_buf,
					msg->msg_iov[0].iov_len, conn, ep);
	if (ret)
		return ret;

	request = util_buf_alloc(fi_ibv_rdm_request_pool);
	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err = FI_SUCCESS;

	request->minfo.is_tagged = 0;
	request->rmabuf = rdm_buf;

	fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY,
				   &post_ready_data);
}

static ssize_t
fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov,
			 void **desc, size_t count, fi_addr_t dest_addr,
			 uint64_t addr, uint64_t key, void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};
	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = dest_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	return fi_ibv_rdm_ep_rma_writemsg(ep_fid, &msg,
					  (ep_rdm->tx_selective_completion ?
					   0ULL : FI_COMPLETION));
}

static ssize_t
fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
			void *desc, fi_addr_t dest_addr, uint64_t addr,
			uint64_t key, void *context)
{
	const struct iovec iov = {
		.iov_base = (void *)buf,
		.iov_len = len
	};

	return fi_ibv_rdm_ep_rma_writev(ep_fid, &iov, &desc, 1, dest_addr,
					addr, key, context);
}

static ssize_t
fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len,
			       fi_addr_t dest_addr, uint64_t addr, uint64_t key)
{
	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_ibv_rdm_conn *conn =
		ep_rdm->av->addr_to_conn(ep_rdm, dest_addr);
	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep_rdm,
		.flags = 0, /* inject does not generate completion */
		.data_len = (uint64_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint64_t)key,
		.lkey = 0
	};
	struct fi_ibv_rdm_request *request =
		util_buf_alloc(fi_ibv_rdm_request_pool);
	ssize_t ret;

	FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->state.err = FI_SUCCESS;

	request->minfo.is_tagged = 0;

	ret = fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);
	switch (ret) {
	case FI_SUCCESS:
		return ret;
	case -FI_EAGAIN:
		break;
	default:
		ret = -errno;
		break;
	}

	FI_IBV_RDM_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG);
	util_buf_release(fi_ibv_rdm_request_pool, request);

	fi_ibv_rdm_tagged_poll(ep_rdm);
	return ret;
}

static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = {
	.size = sizeof(struct fi_ops_rma),
	.read = fi_ibv_rdm_ep_rma_read,
	.readv = fi_ibv_rdm_ep_rma_readv,
	.readmsg = fi_ibv_rdm_ep_rma_readmsg,
	.write = fi_ibv_rdm_ep_rma_write,
	.writev = fi_ibv_rdm_ep_rma_writev,
	.writemsg = fi_ibv_rdm_ep_rma_writemsg,
	.inject = fi_ibv_rdm_ep_rma_inject_write,
	.writedata = fi_no_rma_writedata,
	.injectdata = fi_no_rma_injectdata,
};

struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(void)
{
	return &fi_ibv_rdm_ep_rma_ops;
}
static ssize_t
fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len,
		       void *desc, fi_addr_t src_addr, uint64_t addr,
		       uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	if (desc == NULL && len >= ep->rndv_threshold)
		goto out_errinput;

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)src_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);
			if (raw_buf) {
				desc = (void *)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}
		if (again)
			goto out_again;
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   RMA_RESOURCES_IS_BUSY(conn, ep)) {
		/*
		 * TODO: Should a postponed queue flow for RMA be implemented?
		 */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request =
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.ep_rdm = ep,
		.conn = conn,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_READ
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };
	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);
out:
	return ret;
out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;
out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
			  uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_read(ep, msg->msg_iov[0].iov_base,
					      msg->msg_iov[0].iov_len,
					      msg->desc[0], msg->addr,
					      msg->rma_iov[0].addr,
					      msg->rma_iov[0].key,
					      msg->context);
	}

	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
			size_t count, fi_addr_t src_addr, uint64_t addr,
			uint64_t key, void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,	/* remote target address */
		.len = 0,
		.key = key
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = src_addr,	/* peer address */
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, 0);
}

static ssize_t
fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
			void *desc, fi_addr_t dest_addr, uint64_t addr,
			uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	if (desc == NULL && len >= ep->rndv_threshold)
		goto out_errinput;

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)dest_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);
			if (raw_buf) {
				memcpy(raw_buf, buf, len);
				desc = (void *)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}
		if (again)
			goto out_again;
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   SEND_RESOURCES_IS_BUSY(conn, ep)) {
		/*
		 * TODO: Should a postponed queue flow for RMA be implemented?
		 */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request =
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_WRITE
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };
	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);
	ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER;
out:
	return ret;
out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;
out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
			   uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_write(ep, msg->msg_iov[0].iov_base,
					       msg->msg_iov[0].iov_len,
					       msg->desc[0], msg->addr,
					       msg->rma_iov[0].addr,
					       msg->rma_iov[0].key,
					       msg->context);
	}

	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep, const struct iovec *iov,
			 void **desc, size_t count, fi_addr_t dest_addr,
			 uint64_t addr, uint64_t key, void *context)
{
	struct fi_rma_iov rma_iov = {
		.addr = addr,	/* remote target address */
		.len = 0,
		.key = key
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = dest_addr,	/* peer address */
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_writemsg(ep, &msg, 0);
}

static ssize_t
fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len,
			       fi_addr_t dest_addr, uint64_t addr, uint64_t key)
{
	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)dest_addr;
	struct fi_ibv_rdm_tagged_request *request = NULL;
	int ret = FI_EP_RDM_HNDL_AGAIN;

	if (len >= ep_rdm->rndv_threshold)
		return -FI_EMSGSIZE;

	if (fi_ibv_rdm_check_connection(conn, ep_rdm) &&
	    !RMA_RESOURCES_IS_BUSY(conn, ep_rdm) &&
	    !conn->postponed_entry) {
		request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
		FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request,
					      FI_LOG_DEBUG);

		/* Initial state */
		request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT;
		request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;

		struct fi_ibv_rdm_rma_start_data start_data = {
			.conn = conn,
			.ep_rdm = ep_rdm,
			.data_len = (uint32_t)len,
			.rbuf = addr,
			.lbuf = (uintptr_t)buf,
			.rkey = (uint32_t)key,
			.lkey = 0
		};

		ret = fi_ibv_rdm_tagged_req_hndl(request,
						 FI_IBV_EVENT_RMA_START,
						 &start_data);
	}

	switch (ret) {
	case FI_EP_RDM_HNDL_SUCCESS:
		return ret;
	case FI_EP_RDM_HNDL_AGAIN:
		ret = -FI_EAGAIN;
		break;
	default:
		ret = -errno;
		break;
	}

	if (request) {
		FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request,
					      FI_LOG_DEBUG);
		util_buf_release(fi_ibv_rdm_tagged_request_pool, request);
	}

	fi_ibv_rdm_tagged_poll(ep_rdm);
	return ret;
}

static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = {
	.size = sizeof(struct fi_ops_rma),
	.read = fi_ibv_rdm_ep_rma_read,
	.readv = fi_ibv_rdm_ep_rma_readv,
	.readmsg = fi_ibv_rdm_ep_rma_readmsg,
	.write = fi_ibv_rdm_ep_rma_write,
	.writev = fi_ibv_rdm_ep_rma_writev,
	.writemsg = fi_ibv_rdm_ep_rma_writemsg,
	.inject = fi_ibv_rdm_ep_rma_inject_write,
	.writedata = fi_no_rma_writedata,
	.injectdata = fi_no_rma_injectdata,
};

struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep)
{
	return &fi_ibv_rdm_ep_rma_ops;
}
// TODO handle all flags
static ssize_t rxm_ep_send_common(struct fid_ep *ep_fid, const struct iovec *iov,
				  void **desc, size_t count, fi_addr_t dest_addr,
				  void *context, uint64_t data, uint64_t tag,
				  uint64_t flags, int op)
{
	struct rxm_ep *rxm_ep;
	struct rxm_conn *rxm_conn;
	struct rxm_tx_entry *tx_entry;
	struct rxm_pkt *pkt;
	struct fid_mr *mr;
	void *desc_tx_buf = NULL;
	struct rxm_rma_iov *rma_iov;
	int pkt_size = 0;
	int i, ret;

	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);

	ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn);
	if (ret)
		return ret;

	if (freestack_isempty(rxm_ep->txe_fs)) {
		FI_DBG(&rxm_prov, FI_LOG_CQ, "Exhausted tx_entry freestack\n");
		return -FI_ENOMEM;
	}

	tx_entry = freestack_pop(rxm_ep->txe_fs);
	tx_entry->ctx_type = RXM_TX_ENTRY;
	tx_entry->ep = rxm_ep;
	tx_entry->context = context;
	tx_entry->flags = flags;

	if (rxm_ep->msg_info->mode & FI_LOCAL_MR) {
		pkt = util_buf_get_ex(rxm_ep->tx_pool, (void **)&mr);
		desc_tx_buf = fi_mr_desc(mr);
	} else {
		pkt = util_buf_get(rxm_ep->tx_pool);
	}
	assert(pkt);

	tx_entry->pkt = pkt;

	rxm_pkt_init(pkt);
	pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key;
	pkt->hdr.op = op;
	pkt->hdr.size = ofi_get_iov_len(iov, count);
	rxm_op_hdr_process_flags(&pkt->hdr, flags, data);

	if (op == ofi_op_tagged)
		pkt->hdr.tag = tag;

	if (pkt->hdr.size > RXM_TX_DATA_SIZE) {
		if (flags & FI_INJECT) {
			FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
				"inject size supported: %zu, msg size: %" PRIu64 "\n",
				rxm_tx_attr.inject_size, pkt->hdr.size);
			ret = -FI_EMSGSIZE;
			goto err;
		}
		tx_entry->msg_id = ofi_idx2key(&rxm_ep->tx_key_idx,
					       rxm_txe_fs_index(rxm_ep->txe_fs,
								tx_entry));
		pkt->ctrl_hdr.msg_id = tx_entry->msg_id;
		pkt->ctrl_hdr.type = ofi_ctrl_large_data;
		rma_iov = (struct rxm_rma_iov *)pkt->data;
		rma_iov->count = count;
		for (i = 0; i < count; i++) {
			rma_iov->iov[i].addr =
				rxm_ep->msg_info->domain_attr->mr_mode ==
				FI_MR_SCALABLE ? 0 : (uintptr_t)iov[i].iov_base;
			rma_iov->iov[i].len = (uint64_t)iov[i].iov_len;
			rma_iov->iov[i].key = fi_mr_key(desc[i]);
		}
		pkt_size = sizeof(*pkt) + sizeof(*rma_iov) +
			   sizeof(*rma_iov->iov) * count;
		FI_DBG(&rxm_prov, FI_LOG_CQ,
		       "Sending large msg. msg_id: 0x%" PRIx64 "\n",
		       tx_entry->msg_id);
		FI_DBG(&rxm_prov, FI_LOG_CQ, "tx_entry->state -> RXM_LMT_START\n");
		tx_entry->state = RXM_LMT_START;
	} else {
		pkt->ctrl_hdr.type = ofi_ctrl_data;
		ofi_copy_iov_buf(iov, count, pkt->data, pkt->hdr.size, 0,
				 OFI_COPY_IOV_TO_BUF);
		pkt_size = sizeof(*pkt) + pkt->hdr.size;
	}

	ret = fi_send(rxm_conn->msg_ep, pkt, pkt_size, desc_tx_buf, 0, tx_entry);
	if (ret) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"fi_send for MSG provider failed\n");
		goto err;
	}
	return 0;
err:
	util_buf_release(rxm_ep->tx_pool, pkt);
	freestack_push(rxm_ep->txe_fs, tx_entry);
	return ret;
}
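/*
 * Illustrative sketch (not provider code): an untagged fi_send()-style
 * entry point would funnel into rxm_ep_send_common() roughly as below.
 * The FI_COMPLETION flag is an assumption; the provider derives the TX
 * flags from the endpoint's attributes.
 */
static ssize_t example_rxm_ep_send(struct fid_ep *ep_fid, const void *buf,
				   size_t len, void *desc, fi_addr_t dest_addr,
				   void *context)
{
	struct iovec iov = {
		.iov_base = (void *)buf,
		.iov_len = len,
	};

	return rxm_ep_send_common(ep_fid, &iov, &desc, 1, dest_addr, context,
				  0, 0, FI_COMPLETION, ofi_op_msg);
}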