static inline ssize_t
rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg,
		  uint64_t flags, rxm_rma_msg_fn rma_msg, uint64_t comp_flags)
{
	struct rxm_rma_buf *rma_buf;
	struct fi_msg_rma msg_rma = *msg;
	struct rxm_conn *rxm_conn;
	void *mr_desc[RXM_IOV_LIMIT] = { 0 };
	int ret;

	assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit);

	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
	if (OFI_UNLIKELY(ret))
		return ret;

	ofi_ep_lock_acquire(&rxm_ep->util_ep);
	rma_buf = rxm_rma_buf_alloc(rxm_ep);
	if (OFI_UNLIKELY(!rma_buf)) {
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"Ran out of buffers from RMA buffer pool\n");
		ret = -FI_ENOMEM;
		goto unlock;
	}

	rma_buf->app_context = msg->context;
	rma_buf->flags = flags;

	ret = rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc,
				 msg_rma.iov_count,
				 comp_flags & (FI_WRITE | FI_READ), rma_buf);
	if (OFI_UNLIKELY(ret))
		goto release;

	msg_rma.desc = mr_desc;
	msg_rma.context = rma_buf;

	ret = rma_msg(rxm_conn->msg_ep, &msg_rma, flags);
	if (OFI_LIKELY(!ret))
		goto unlock;

	if ((rxm_ep->msg_mr_local) && (!rxm_ep->rxm_mr_local))
		rxm_ep_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count);
release:
	ofi_buf_free(rma_buf);
unlock:
	ofi_ep_lock_release(&rxm_ep->util_ep);
	return ret;
}
/* SMSG callback for AMO remote counter control message. */
int __smsg_amo_cntr(void *data, void *msg)
{
	int ret = FI_SUCCESS;
	struct gnix_vc *vc = (struct gnix_vc *)data;
	struct gnix_smsg_amo_cntr_hdr *hdr =
		(struct gnix_smsg_amo_cntr_hdr *)msg;
	struct gnix_fid_ep *ep = vc->ep;
	gni_return_t status;

	if (hdr->flags & FI_REMOTE_WRITE && ep->rwrite_cntr) {
		ret = _gnix_cntr_inc(ep->rwrite_cntr);
		if (ret != FI_SUCCESS)
			GNIX_WARN(FI_LOG_EP_DATA,
				  "_gnix_cntr_inc() failed: %d\n", ret);
	}

	if (hdr->flags & FI_REMOTE_READ && ep->rread_cntr) {
		ret = _gnix_cntr_inc(ep->rread_cntr);
		if (ret != FI_SUCCESS)
			GNIX_WARN(FI_LOG_EP_DATA,
				  "_gnix_cntr_inc() failed: %d\n", ret);
	}

	status = GNI_SmsgRelease(vc->gni_ep);
	if (OFI_UNLIKELY(status != GNI_RC_SUCCESS)) {
		GNIX_WARN(FI_LOG_EP_DATA,
			  "GNI_SmsgRelease returned %s\n",
			  gni_err_str[status]);
		ret = gnixu_to_fi_errno(status);
	}

	return ret;
}
/* Must call with cq->lock held */
static inline int fi_ibv_poll_outstanding_cq(struct fi_ibv_ep *ep,
					     struct fi_ibv_cq *cq)
{
	struct fi_ibv_wce *wce;
	struct ibv_wc wc;
	ssize_t ret;

	ret = ibv_poll_cq(cq->cq, 1, &wc);
	if (ret <= 0)
		return ret;

	/* Handle WR entry when user doesn't request the completion */
	if (wc.wr_id == VERBS_INJECT_FLAG) {
		/* To ensure the new iteration */
		return 1;
	}

	if (wc.status != IBV_WC_WR_FLUSH_ERR) {
		ret = fi_ibv_wc_2_wce(cq, &wc, &wce);
		if (OFI_UNLIKELY(ret)) {
			ret = -FI_EAGAIN;
			goto fn;
		}
		slist_insert_tail(&wce->entry, &cq->wcq);
	}
	ret = 1;
fn:
	return ret;
}
static inline ssize_t
rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov,
		   void **desc, void **desc_storage, size_t iov_count,
		   uint64_t comp_flags, struct rxm_rma_buf *rma_buf)
{
	size_t i;

	if (rxm_ep->msg_mr_local) {
		if (!rxm_ep->rxm_mr_local) {
			ssize_t ret =
				rxm_ep_msg_mr_regv(rxm_ep, msg_iov, iov_count,
						   comp_flags &
							(FI_WRITE | FI_READ),
						   rma_buf->mr.mr);
			if (OFI_UNLIKELY(ret))
				return ret;

			for (i = 0; i < iov_count; i++)
				desc_storage[i] =
					fi_mr_desc(rma_buf->mr.mr[i]);
			rma_buf->mr.count = iov_count;
		} else {
			for (i = 0; i < iov_count; i++)
				desc_storage[i] = fi_mr_desc(desc[i]);
		}
	}
	return FI_SUCCESS;
}
static inline int
rxm_ep_send_atomic_req(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn,
		       struct rxm_tx_atomic_buf *tx_buf, uint64_t len)
{
	int ret;

	/* Atomic request TX completion processing is performed when the
	 * software generated atomic response message is received. */
	tx_buf->hdr.state = RXM_ATOMIC_RESP_WAIT;
	if (len <= rxm_ep->inject_limit)
		ret = fi_inject(rxm_conn->msg_ep, &tx_buf->pkt, len, 0);
	else
		ret = fi_send(rxm_conn->msg_ep, &tx_buf->pkt, len,
			      tx_buf->hdr.desc, 0, tx_buf);
	if (ret == -FI_EAGAIN)
		rxm_ep_do_progress(&rxm_ep->util_ep);

	if (OFI_LIKELY(!ret))
		FI_DBG(&rxm_prov, FI_LOG_EP_DATA,
		       "sent atomic request: op: %" PRIu8
		       " msg_id: 0x%" PRIx64 "\n",
		       tx_buf->pkt.hdr.op, tx_buf->pkt.ctrl_hdr.msg_id);
	else if (OFI_UNLIKELY(ret != -FI_EAGAIN))
		FI_WARN(&rxm_prov, FI_LOG_EP_DATA,
			"unable to send atomic request: op: %" PRIu8
			" msg_id: 0x%" PRIx64 "\n",
			tx_buf->pkt.hdr.op, tx_buf->pkt.ctrl_hdr.msg_id);
	return ret;
}
static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len)
{
	int ret;

	if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED)
		rxm_cntr_incerr(rx_buf->ep->util_ep.rx_cntr);

	FI_WARN(&rxm_prov, FI_LOG_CQ,
		"Message truncated: recv buf length: %zu "
		"message length: %" PRIu64 "\n",
		done_len, rx_buf->pkt.hdr.size);
	ret = ofi_cq_write_error_trunc(rx_buf->ep->util_ep.rx_cq,
				       rx_buf->recv_entry->context,
				       rx_buf->recv_entry->comp_flags |
					       rxm_cq_get_rx_comp_flags(rx_buf),
				       rx_buf->pkt.hdr.size,
				       rx_buf->recv_entry->rxm_iov.iov[0].iov_base,
				       rx_buf->pkt.hdr.data,
				       rx_buf->pkt.hdr.tag,
				       rx_buf->pkt.hdr.size - done_len);
	if (OFI_UNLIKELY(ret)) {
		FI_WARN(&rxm_prov, FI_LOG_CQ,
			"Unable to write recv error CQ\n");
		return ret;
	}
	return 0;
}
int ofi_monitor_subscribe(struct ofi_notification_queue *nq,
			  void *addr, size_t len,
			  struct ofi_subscription *subscription)
{
	int ret;

	FI_DBG(&core_prov, FI_LOG_MR,
	       "subscribing addr=%p len=%zu subscription=%p nq=%p\n",
	       addr, len, subscription, nq);

	/* Ensure the subscription is initialized before we can get events */
	dlist_init(&subscription->entry);
	subscription->nq = nq;
	subscription->addr = addr;
	subscription->len = len;

	fastlock_acquire(&nq->lock);
	nq->refcnt++;
	fastlock_release(&nq->lock);

	ret = nq->monitor->subscribe(nq->monitor, addr, len, subscription);
	if (OFI_UNLIKELY(ret)) {
		FI_WARN(&core_prov, FI_LOG_MR,
			"Failed (ret = %d) to monitor addr=%p len=%zu\n",
			ret, addr, len);
		fastlock_acquire(&nq->lock);
		nq->refcnt--;
		fastlock_release(&nq->lock);
	}
	return ret;
}
static ssize_t mrail_tsend_common(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, size_t len, fi_addr_t dest_addr, uint64_t tag, uint64_t data, void *context, uint64_t flags) { struct mrail_ep *mrail_ep = container_of(ep_fid, struct mrail_ep, util_ep.ep_fid.fid); struct mrail_peer_info *peer_info; struct iovec *iov_dest = alloca(sizeof(*iov_dest) * (count + 1)); struct mrail_tx_buf *tx_buf; uint32_t i = mrail_get_tx_rail(mrail_ep); struct fi_msg msg; ssize_t ret; peer_info = ofi_av_get_addr(mrail_ep->util_ep.av, (int) dest_addr); ofi_ep_lock_acquire(&mrail_ep->util_ep); tx_buf = mrail_get_tx_buf(mrail_ep, context, peer_info->seq_no++, ofi_op_tagged, flags | FI_TAGGED); if (OFI_UNLIKELY(!tx_buf)) { ret = -FI_ENOMEM; goto err1; } tx_buf->hdr.tag = tag; mrail_copy_iov_hdr(&tx_buf->hdr, iov_dest, iov, count); msg.msg_iov = iov_dest; msg.desc = desc; msg.iov_count = count + 1; msg.addr = dest_addr; msg.context = tx_buf; msg.data = data; if (len < mrail_ep->rails[i].info->tx_attr->inject_size) flags |= FI_INJECT; FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting tsend of length: %" PRIu64 " dest_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 " seq: %d" " on rail: %d\n", len, dest_addr, tag, peer_info->seq_no - 1, i); ret = fi_sendmsg(mrail_ep->rails[i].ep, &msg, flags); if (ret) { FI_WARN(&mrail_prov, FI_LOG_EP_DATA, "Unable to fi_sendmsg on rail: %" PRIu32 "\n", i); goto err2; } else if (!(flags & FI_COMPLETION)) { ofi_ep_tx_cntr_inc(&mrail_ep->util_ep); } ofi_ep_lock_release(&mrail_ep->util_ep); return ret; err2: util_buf_release(mrail_ep->tx_buf_pool, tx_buf); err1: peer_info->seq_no--; ofi_ep_lock_release(&mrail_ep->util_ep); return ret; }
static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count)
{
	struct fi_ibv_cq *cq;
	struct fi_ibv_wce *wce;
	struct slist_entry *entry;
	struct ibv_wc wc;
	ssize_t ret = 0, i;

	cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid);

	cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock);

	for (i = 0; i < count; i++) {
		if (!slist_empty(&cq->wcq)) {
			wce = container_of(cq->wcq.head,
					   struct fi_ibv_wce, entry);
			if (wce->wc.status) {
				ret = -FI_EAVAIL;
				break;
			}
			entry = slist_remove_head(&cq->wcq);
			wce = container_of(entry, struct fi_ibv_wce, entry);
			cq->read_entry(&wce->wc,
				       (char *)buf + i * cq->entry_size);
			util_buf_release(cq->wce_pool, wce);
			continue;
		}

		ret = fi_ibv_poll_cq(cq, &wc);
		if (ret <= 0)
			break;

		/* Insert error entry into wcq */
		if (OFI_UNLIKELY(wc.status)) {
			if (wc.status == IBV_WC_WR_FLUSH_ERR) {
				/* Handle case when remote side destroys
				 * the connection, but local side isn't aware
				 * about that yet */
				VERBS_DBG(FI_LOG_CQ,
					  "Ignoring WC with status "
					  "IBV_WC_WR_FLUSH_ERR(%d)\n",
					  wc.status);
				i--;
				continue;
			}
			wce = util_buf_alloc(cq->wce_pool);
			if (!wce) {
				cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
				return -FI_ENOMEM;
			}
			memset(wce, 0, sizeof(*wce));
			memcpy(&wce->wc, &wc, sizeof wc);
			slist_insert_tail(&wce->entry, &cq->wcq);
			ret = -FI_EAVAIL;
			break;
		}

		cq->read_entry(&wc, (char *)buf + i * cq->entry_size);
	}

	/* Reconstructed tail (assumption; the snippet was cut off here):
	 * release the CQ lock and report the number of entries read, or
	 * -FI_EAGAIN when nothing was available. */
	cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock);
	return i ? i : (ret ? ret : -FI_EAGAIN);
}
static int util_mr_cache_create(struct ofi_mr_cache *cache, const struct iovec *iov, uint64_t access, struct ofi_mr_entry **entry) { int ret; FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %" PRIu64 ")\n", iov->iov_base, iov->iov_len); util_mr_cache_process_events(cache); *entry = util_buf_alloc(cache->entry_pool); if (OFI_UNLIKELY(!*entry)) return -FI_ENOMEM; (*entry)->iov = *iov; (*entry)->use_cnt = 1; ret = cache->add_region(cache, *entry); if (ret) { while (ret && ofi_mr_cache_flush(cache)) { ret = cache->add_region(cache, *entry); } if (ret) { assert(!ofi_mr_cache_flush(cache)); util_buf_release(cache->entry_pool, *entry); return ret; } } cache->cached_size += iov->iov_len; if ((++cache->cached_cnt > cache->max_cached_cnt) || (cache->cached_size > cache->max_cached_size)) { (*entry)->cached = 0; } else { if (cache->mr_storage.insert(&cache->mr_storage, &(*entry)->iov, *entry)) { ret = -FI_ENOMEM; goto err; } (*entry)->cached = 1; ret = ofi_monitor_subscribe(&cache->nq, iov->iov_base, iov->iov_len, &(*entry)->subscription); if (ret) goto err; (*entry)->subscribed = 1; } return 0; err: util_mr_free_entry(cache, *entry); return ret; }
static struct ofi_mr_entry *
ofi_mr_rbt_storage_find(struct ofi_mr_storage *storage, const struct iovec *key)
{
	struct ofi_mr_entry *entry;
	RbtIterator iter = rbtFind((RbtHandle)storage->storage, (void *)key);

	if (OFI_UNLIKELY(!iter))
		return iter;

	rbtKeyValue(storage->storage, iter, (void *)&key, (void *)&entry);
	return entry;
}
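/* A minimal sketch of the matching insert helper, for context only.  It
 * assumes the storage wraps the same rbtree handle used by rbtFind() and
 * rbtKeyValue() above and that rbtInsert() reports RBT_STATUS_OK on
 * success; the helper name and the -FI_ENOMEM mapping are illustrative
 * assumptions, not taken from this section. */
static int ofi_mr_rbt_storage_insert(struct ofi_mr_storage *storage,
				     struct iovec *key,
				     struct ofi_mr_entry *entry)
{
	return (rbtInsert((RbtHandle)storage->storage, (void *)key,
			  (void *)entry) == RBT_STATUS_OK) ? 0 : -FI_ENOMEM;
}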
static ssize_t
rxm_ep_generic_atomic_writemsg(struct rxm_ep *rxm_ep,
			       const struct fi_msg_atomic *msg, uint64_t flags)
{
	int ret;
	struct rxm_conn *rxm_conn;

	ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn);
	if (OFI_UNLIKELY(ret))
		return ret;

	return rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0,
				    NULL, NULL, 0, ofi_op_atomic, flags);
}
static inline int rxm_cq_tx_comp_write(struct rxm_ep *rxm_ep,
				       uint64_t comp_flags,
				       void *app_context, uint64_t flags)
{
	if (flags & FI_COMPLETION) {
		int ret = ofi_cq_write(rxm_ep->util_ep.tx_cq, app_context,
				       comp_flags, 0, NULL, 0, 0);
		if (OFI_UNLIKELY(ret)) {
			FI_WARN(&rxm_prov, FI_LOG_CQ,
				"Unable to report completion\n");
			return ret;
		}
		rxm_cq_log_comp(comp_flags);
	}
	return 0;
}
int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor,
			  const void *addr, size_t len)
{
	int ret;

	FI_DBG(&core_prov, FI_LOG_MR,
	       "subscribing addr=%p len=%zu\n", addr, len);

	ret = monitor->subscribe(monitor, addr, len);
	if (OFI_UNLIKELY(ret)) {
		FI_WARN(&core_prov, FI_LOG_MR,
			"Failed (ret = %d) to monitor addr=%p len=%zu\n",
			ret, addr, len);
	}
	return ret;
}
static void client_recv_connresp(struct util_wait *wait, struct tcpx_cm_context *cm_ctx) { struct fi_eq_err_entry err_entry = { 0 }; struct tcpx_ep *ep; ssize_t ret; assert(cm_ctx->fid->fclass == FI_CLASS_EP); ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid); ret = ofi_wait_fd_del(wait, ep->conn_fd); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Could not remove fd from wait\n"); goto err; } ret = proc_conn_resp(cm_ctx, ep); if (ret) goto err; FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Received Accept from server\n"); free(cm_ctx); return; err: err_entry.fid = cm_ctx->fid; err_entry.context = cm_ctx->fid->context; err_entry.err = -ret; if (cm_ctx->cm_data_sz) { err_entry.err_data = calloc(1, cm_ctx->cm_data_sz); if (OFI_LIKELY(err_entry.err_data != NULL)) { memcpy(err_entry.err_data, cm_ctx->cm_data, cm_ctx->cm_data_sz); err_entry.err_data_size = cm_ctx->cm_data_sz; } } FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "fi_eq_write the conn refused %"PRId64"\n", ret); free(cm_ctx); /* `err_entry.err_data` must live until it is passed to user */ ret = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_NOTIFY, &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); if (OFI_UNLIKELY(ret < 0)) { free(err_entry.err_data); } }
static struct mrail_tx_buf *mrail_get_tx_buf(struct mrail_ep *mrail_ep,
					     void *context, uint32_t seq,
					     uint8_t op, uint64_t flags)
{
	struct mrail_tx_buf *tx_buf = util_buf_alloc(mrail_ep->tx_buf_pool);

	if (OFI_UNLIKELY(!tx_buf))
		return NULL;

	assert(tx_buf->ep == mrail_ep);
	assert(tx_buf->hdr.version == MRAIL_HDR_VERSION);

	tx_buf->context = context;
	tx_buf->flags = flags;
	tx_buf->hdr.op = op;
	tx_buf->hdr.seq = htonl(seq);
	return tx_buf;
}
static ssize_t rxm_ep_readmsg(struct fid_ep *ep_fid,
			      const struct fi_msg_rma *msg, uint64_t flags)
{
	struct util_cmap_handle *handle;
	struct rxm_conn *rxm_conn;
	struct rxm_ep *rxm_ep;
	int ret;

	rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid);
	ret = ofi_cmap_get_handle(rxm_ep->util_ep.cmap, msg->addr, &handle);
	if (OFI_UNLIKELY(ret))
		return ret;
	rxm_conn = container_of(handle, struct rxm_conn, handle);

	return rxm_ep_rma_common(rxm_conn->msg_ep, rxm_ep, msg, flags,
				 fi_readmsg, FI_READ);
}
struct mrail_recv *
mrail_match_recv_handle_unexp(struct mrail_recv_queue *recv_queue, uint64_t tag,
			      uint64_t addr, char *data, size_t len,
			      void *context)
{
	struct dlist_entry *entry;
	struct mrail_unexp_msg_entry *unexp_msg_entry;
	struct mrail_match_attr match_attr = {
		.tag	= tag,
		.addr	= addr,
	};

	entry = dlist_remove_first_match(&recv_queue->recv_list,
					 recv_queue->match_recv, &match_attr);
	if (OFI_UNLIKELY(!entry)) {
		unexp_msg_entry = recv_queue->get_unexp_msg_entry(recv_queue,
								  context);
		if (!unexp_msg_entry) {
			FI_WARN(recv_queue->prov, FI_LOG_CQ,
				"Unable to get unexp_msg_entry!");
			assert(0);
			return NULL;
		}

		unexp_msg_entry->addr		= addr;
		unexp_msg_entry->tag		= tag;
		unexp_msg_entry->context	= context;
		memcpy(unexp_msg_entry->data, data, len);

		FI_DBG(recv_queue->prov, FI_LOG_CQ,
		       "No matching recv found for incoming msg with "
		       "addr: 0x%" PRIx64 " tag: 0x%" PRIx64 "\n",
		       unexp_msg_entry->addr, unexp_msg_entry->tag);
		FI_DBG(recv_queue->prov, FI_LOG_CQ,
		       "Enqueueing unexp_msg_entry to unexpected msg list\n");
		dlist_insert_tail(&unexp_msg_entry->entry,
				  &recv_queue->unexp_msg_list);
		return NULL;
	}
	return container_of(entry, struct mrail_recv, entry);
}
static int read_cm_data(SOCKET fd, struct tcpx_cm_context *cm_ctx,
			struct ofi_ctrl_hdr *hdr)
{
	cm_ctx->cm_data_sz = ntohs(hdr->seg_size);
	if (cm_ctx->cm_data_sz) {
		size_t data_sz = MIN(cm_ctx->cm_data_sz, TCPX_MAX_CM_DATA_SIZE);
		ssize_t ret = ofi_recv_socket(fd, cm_ctx->cm_data, data_sz,
					      MSG_WAITALL);
		if ((size_t) ret != data_sz)
			return -FI_EIO;

		/* Drain any CM data beyond what fits in the local buffer so
		 * the stream stays aligned for the next message, then clamp
		 * the recorded size to what was actually stored. */
		if (OFI_UNLIKELY(cm_ctx->cm_data_sz > TCPX_MAX_CM_DATA_SIZE))
			discard_cm_data(fd, cm_ctx->cm_data_sz -
					TCPX_MAX_CM_DATA_SIZE);
		cm_ctx->cm_data_sz = data_sz;
	}
	return FI_SUCCESS;
}
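/* read_cm_data() relies on a discard_cm_data() helper that is not shown in
 * this section.  The sketch below is an assumption about what it could look
 * like: it drains 'len' excess bytes from the socket into a scratch buffer;
 * the name, signature, and behavior are illustrative only. */
static void discard_cm_data(SOCKET fd, size_t len)
{
	char buf[TCPX_MAX_CM_DATA_SIZE];

	while (len) {
		size_t sz = MIN(len, sizeof(buf));
		ssize_t ret = ofi_recv_socket(fd, buf, sz, MSG_WAITALL);

		if (ret <= 0)
			return;
		len -= (size_t) ret;
	}
}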
static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) { uint64_t flags; char *data; if (rx_buf->pkt.ctrl_hdr.type == ofi_ctrl_seg_data && rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) { dlist_insert_tail(&rx_buf->unexp_msg.entry, &rx_buf->conn->sar_deferred_rx_msg_list); rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1); if (OFI_UNLIKELY(!rx_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "ran out of buffers from RX buffer pool\n"); return -FI_ENOMEM; } dlist_insert_tail(&rx_buf->repost_entry, &rx_buf->ep->repost_ready_list); return 0; } flags = rxm_cq_get_rx_comp_and_op_flags(rx_buf); if (rx_buf->pkt.ctrl_hdr.type != ofi_ctrl_data) flags |= FI_MORE; if (rx_buf->pkt.ctrl_hdr.type == ofi_ctrl_large_data) data = rxm_pkt_rndv_data(&rx_buf->pkt); else data = rx_buf->pkt.data; FI_DBG(&rxm_prov, FI_LOG_CQ, "writing buffered recv completion: " "length: %" PRIu64 "\n", rx_buf->pkt.hdr.size); rx_buf->recv_context.ep = &rx_buf->ep->util_ep.ep_fid; return rxm_cq_write_recv_comp(rx_buf, &rx_buf->recv_context, flags, rx_buf->pkt.hdr.size, data); }
static ssize_t rxm_ep_atomic_writev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { const struct fi_ioc iov = { .addr = (void *) buf, .count = count, }; return rxm_ep_atomic_writev(ep_fid, &iov, &desc, 1, dest_addr, addr, key, datatype, op, context); } static ssize_t rxm_ep_atomic_inject(struct fid_ep *ep_fid, const void *buf, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_ioc msg_iov = { .addr = (void *) buf, .count = count, }; struct fi_rma_ioc rma_iov = { .addr = addr, .count = count, .key = key, }; struct fi_msg_atomic msg = { .msg_iov = &msg_iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = NULL, .data = 0, }; return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, FI_INJECT); } static ssize_t rxm_ep_generic_atomic_readwritemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int ret; struct rxm_conn *rxm_conn; ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; return rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0, resultv, result_desc, result_count, ofi_op_atomic_fetch, flags); } static ssize_t rxm_ep_atomic_readwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_atomic_readwritemsg(rxm_ep, msg, resultv, result_desc, result_count, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_atomic_readwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, struct fi_ioc *resultv, void **result_desc, size_t result_count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_readwritemsg(rxm_ep, &msg, resultv, result_desc, result_count, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, 
uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct fi_ioc iov = { .addr = (op == FI_ATOMIC_READ) ? NULL : (void *) buf, .count = count, }; struct fi_ioc result_iov = { .addr = result, .count = count, }; if (!buf && op != FI_ATOMIC_READ) return -FI_EINVAL; return rxm_ep_atomic_readwritev(ep_fid, &iov, &desc, 1, &result_iov, &result_desc, 1, dest_addr, addr, key, datatype, op, context); } static ssize_t rxm_ep_generic_atomic_compwritemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { int ret; struct rxm_conn *rxm_conn; ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; return rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, ofi_op_atomic_compare, flags); } static ssize_t rxm_ep_atomic_compwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_atomic_compwritemsg(rxm_ep, msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_atomic_compwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, void **desc, size_t count, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, struct fi_ioc *resultv, void **result_desc, size_t result_count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_ioc rma_iov = { .addr = addr, .count = ofi_total_ioc_cnt(iov, count), .key = key, }; struct fi_msg_atomic msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .datatype = datatype, .op = op, .context = context, .data = 0, }; return rxm_ep_generic_atomic_compwritemsg(rxm_ep, &msg, comparev, compare_desc, compare_count, resultv, result_desc, result_count, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { struct fi_ioc iov = { .addr = (void *) buf, .count = count, }; struct fi_ioc resultv = { .addr = result, .count = count, }; struct fi_ioc comparev = { .addr = (void *) compare, .count = count, }; return rxm_ep_atomic_compwritev(ep_fid, &iov, &desc, 1, &comparev, &compare_desc, 1, &resultv, &result_desc, 1, dest_addr, addr, key, datatype, op, context); } int rxm_ep_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { struct rxm_domain *rxm_domain = container_of(domain, struct rxm_domain, util_domain.domain_fid); size_t tot_size; int ret; if (flags & FI_TAGGED) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "tagged atomic op not supported\n"); return -FI_EINVAL; } ret = ofi_atomic_valid(&rxm_prov, datatype, op, flags); if (ret || !attr) return ret; tot_size = flags & FI_COMPARE_ATOMIC ? 
rxm_domain->max_atomic_size / 2 : rxm_domain->max_atomic_size; attr->size = ofi_datatype_size(datatype); attr->count = tot_size / attr->size; return FI_SUCCESS; } static int rxm_ep_atomic_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, 0); if (!ret) *count = attr.count; return ret; } static int rxm_ep_atomic_fetch_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, FI_FETCH_ATOMIC); if (!ret) *count = attr.count; return ret; } static int rxm_ep_atomic_cswap_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, enum fi_op op, size_t *count) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid); struct fi_atomic_attr attr; int ret; ret = rxm_ep_query_atomic(&rxm_ep->util_ep.domain->domain_fid, datatype, op, &attr, FI_COMPARE_ATOMIC); if (!ret) *count = attr.count; return ret; } struct fi_ops_atomic rxm_ops_atomic = { .size = sizeof(struct fi_ops_atomic), .write = rxm_ep_atomic_write, .writev = rxm_ep_atomic_writev, .writemsg = rxm_ep_atomic_writemsg, .inject = rxm_ep_atomic_inject, .readwrite = rxm_ep_atomic_readwrite, .readwritev = rxm_ep_atomic_readwritev, .readwritemsg = rxm_ep_atomic_readwritemsg, .compwrite = rxm_ep_atomic_compwrite, .compwritev = rxm_ep_atomic_compwritev, .compwritemsg = rxm_ep_atomic_compwritemsg, .writevalid = rxm_ep_atomic_valid, .readwritevalid = rxm_ep_atomic_fetch_valid, .compwritevalid = rxm_ep_atomic_cswap_valid, };
static ssize_t rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_iov_count, struct fi_ioc *resultv, void **result_desc, size_t result_iov_count, uint32_t op, uint64_t flags) { struct rxm_tx_atomic_buf *tx_buf; struct rxm_atomic_hdr *atomic_hdr; struct iovec buf_iov[RXM_IOV_LIMIT]; struct iovec cmp_iov[RXM_IOV_LIMIT]; size_t datatype_sz = ofi_datatype_size(msg->datatype); size_t buf_len = 0; size_t cmp_len = 0; size_t tot_len; ssize_t ret; assert(msg->iov_count <= RXM_IOV_LIMIT && msg->rma_iov_count <= RXM_IOV_LIMIT); if (flags & FI_REMOTE_CQ_DATA) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "atomic with remote CQ data not supported\n"); return -FI_EINVAL; } if (msg->op != FI_ATOMIC_READ) { assert(msg->msg_iov); ofi_ioc_to_iov(msg->msg_iov, buf_iov, msg->iov_count, datatype_sz); buf_len = ofi_total_iov_len(buf_iov, msg->iov_count); } if (op == ofi_op_atomic_compare) { assert(comparev); ofi_ioc_to_iov(comparev, cmp_iov, compare_iov_count, datatype_sz); cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count); assert(buf_len == cmp_len); } tot_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr) + sizeof(struct rxm_pkt); if (tot_len > rxm_eager_limit) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "atomic data too large %zu\n", tot_len); return -FI_EINVAL; } ofi_ep_lock_acquire(&rxm_ep->util_ep); tx_buf = (struct rxm_tx_atomic_buf *) rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); if (OFI_UNLIKELY(!tx_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from Atomic buffer pool\n"); ret = -FI_EAGAIN; goto unlock; } rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, tot_len, op, msg->datatype, msg->op, flags, msg->data, msg->rma_iov, msg->rma_iov_count); tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf); tx_buf->app_context = msg->context; atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data; ofi_copy_from_iov(atomic_hdr->data, buf_len, buf_iov, msg->iov_count, 0); if (cmp_len) ofi_copy_from_iov(atomic_hdr->data + buf_len, cmp_len, cmp_iov, compare_iov_count, 0); tx_buf->result_iov_count = result_iov_count; if (resultv) ofi_ioc_to_iov(resultv, tx_buf->result_iov, result_iov_count, datatype_sz); ret = rxm_ep_send_atomic_req(rxm_ep, rxm_conn, tx_buf, tot_len); if (ret) ofi_buf_free(tx_buf); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; }
ssize_t psmx2_recv_generic(struct fid_ep *ep, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context, uint64_t flags) { struct psmx2_fid_ep *ep_priv; struct psmx2_fid_av *av; psm2_epaddr_t psm2_epaddr; psm2_mq_req_t psm2_req; psm2_mq_tag_t psm2_tag, psm2_tagsel; struct fi_context *fi_context; int recv_flag = 0; int err; int enable_completion; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); if (flags & FI_TRIGGER) return psmx2_trigger_queue_recv(ep, buf, len, desc, src_addr, context, flags); if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) { av = ep_priv->av; assert(av); psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->rx, src_addr, av->type); } else { psm2_epaddr = 0; } PSMX2_SET_TAG(psm2_tag, 0ULL, 0, PSMX2_TYPE_MSG); PSMX2_SET_MASK(psm2_tagsel, PSMX2_MATCH_NONE, PSMX2_TYPE_MASK); enable_completion = !ep_priv->recv_selective_completion || (flags & FI_COMPLETION); if (enable_completion) { assert(context); fi_context = context; if (flags & FI_MULTI_RECV) { struct psmx2_multi_recv *req; req = calloc(1, sizeof(*req)); if (!req) return -FI_ENOMEM; req->src_addr = psm2_epaddr; req->tag = psm2_tag; req->tagsel = psm2_tagsel; req->flag = recv_flag; req->buf = buf; req->len = len; req->offset = 0; req->min_buf_size = ep_priv->min_multi_recv; req->context = fi_context; PSMX2_CTXT_TYPE(fi_context) = PSMX2_MULTI_RECV_CONTEXT; PSMX2_CTXT_USER(fi_context) = req; if (len > PSMX2_MAX_MSG_SIZE) len = PSMX2_MAX_MSG_SIZE; } else { PSMX2_CTXT_TYPE(fi_context) = PSMX2_RECV_CONTEXT; PSMX2_CTXT_USER(fi_context) = buf; } PSMX2_CTXT_EP(fi_context) = ep_priv; PSMX2_CTXT_SIZE(fi_context) = len; } else { PSMX2_EP_GET_OP_CONTEXT(ep_priv, fi_context); #if !PSMX2_USE_REQ_CONTEXT PSMX2_CTXT_TYPE(fi_context) = PSMX2_NOCOMP_RECV_CONTEXT; PSMX2_CTXT_EP(fi_context) = ep_priv; PSMX2_CTXT_USER(fi_context) = buf; PSMX2_CTXT_SIZE(fi_context) = len; #endif } err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr, &psm2_tag, &psm2_tagsel, recv_flag, buf, len, (void *)fi_context, &psm2_req); if (OFI_UNLIKELY(err != PSM2_OK)) return psmx2_errno(err); if (enable_completion) { PSMX2_CTXT_REQ(fi_context) = psm2_req; } else { #if PSMX2_USE_REQ_CONTEXT PSMX2_REQ_GET_OP_CONTEXT(psm2_req, fi_context); PSMX2_CTXT_TYPE(fi_context) = PSMX2_NOCOMP_RECV_CONTEXT; PSMX2_CTXT_EP(fi_context) = ep_priv; PSMX2_CTXT_USER(fi_context) = buf; PSMX2_CTXT_SIZE(fi_context) = len; #endif } return 0; }
static ssize_t rxm_ep_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); } static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); } static inline void rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig_msg, struct iovec *rxm_iov, struct fi_msg_rma *rxm_msg) { rxm_msg->context = rma_buf; rxm_msg->addr = orig_msg->addr; rxm_msg->data = orig_msg->data; ofi_copy_from_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size, orig_msg->msg_iov, orig_msg->iov_count, 0); rxm_iov->iov_base = &rma_buf->pkt.data; rxm_iov->iov_len = rma_buf->pkt.hdr.size; rxm_msg->msg_iov = rxm_iov; rxm_msg->desc = &rma_buf->hdr.desc; rxm_msg->iov_count = 1; rxm_msg->rma_iov = orig_msg->rma_iov; rxm_msg->rma_iov_count = orig_msg->rma_iov_count; } static inline ssize_t rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, size_t total_size, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_rma_buf *rma_buf; ssize_t ret; struct iovec rxm_msg_iov = { 0 }; struct fi_msg_rma rxm_rma_msg = { 0 }; assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit); ofi_ep_lock_acquire(&rxm_ep->util_ep); rma_buf = rxm_rma_buf_alloc(rxm_ep); if (OFI_UNLIKELY(!rma_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from RMA buffer pool\n"); ret = -FI_ENOMEM; goto unlock; } rma_buf->pkt.hdr.size = total_size; rma_buf->app_context = msg->context; rma_buf->flags = flags; rxm_ep_format_rma_msg(rma_buf, msg, &rxm_msg_iov, &rxm_rma_msg); flags = (flags & ~FI_INJECT) | FI_COMPLETION; ret = fi_writemsg(rxm_conn->msg_ep, &rxm_rma_msg, flags); if (OFI_UNLIKELY(ret)) { if (ret == -FI_EAGAIN) rxm_ep_do_progress(&rxm_ep->util_ep); ofi_buf_free(rma_buf); } unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; } static inline ssize_t rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, uint64_t flags) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = data, }; return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, len, &msg, flags); } static inline ssize_t rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t 
flags) { struct rxm_conn *rxm_conn; size_t total_size = ofi_total_iov_len(msg->msg_iov, msg->iov_count); ssize_t ret; assert(total_size <= rxm_ep->rxm_info->tx_attr->inject_size); ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if ((total_size <= rxm_ep->msg_info->tx_attr->inject_size) && !(flags & FI_COMPLETION) && (msg->iov_count == 1) && (msg->rma_iov_count == 1)) { if (flags & FI_REMOTE_CQ_DATA) { ret = fi_inject_writedata(rxm_conn->msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->data, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } else { ret = fi_inject_write(rxm_conn->msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write* for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, total_size, msg, flags); } } static inline ssize_t rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); if (flags & FI_INJECT) return rxm_ep_rma_inject_common(rxm_ep, msg, flags); else return rxm_ep_rma_common(rxm_ep, msg, flags, fi_writemsg, FI_WRITE); } static inline ssize_t rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, msg, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA); } static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return 
rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if (len <= rxm_ep->msg_info->tx_attr->inject_size) { ret = fi_inject_write(rxm_conn->msg_ep, buf, len, dest_addr, addr, key); if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, 0, dest_addr, addr, key, FI_INJECT); } } static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if (len <= rxm_ep->msg_info->tx_attr->inject_size) { ret = fi_inject_writedata(rxm_conn->msg_ep, buf, len, data, dest_addr, addr, key); if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_writedata for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, data, dest_addr, addr, key, FI_REMOTE_CQ_DATA | FI_INJECT); } } struct fi_ops_rma rxm_ops_rma = { .size = sizeof (struct fi_ops_rma), .read = rxm_ep_read, .readv = rxm_ep_readv, .readmsg = rxm_ep_readmsg, .write = rxm_ep_write, .writev = rxm_ep_writev, .writemsg = rxm_ep_writemsg, .inject = rxm_ep_inject_write, .writedata = rxm_ep_writedata, .injectdata = rxm_ep_inject_writedata, };
static inline struct efa_conn *efa_av_tbl_idx_to_conn(struct efa_av *av,
						      fi_addr_t addr)
{
	if (OFI_UNLIKELY(addr == FI_ADDR_UNSPEC))
		return NULL;
	return av->conn_table[addr];
}
static ssize_t rxm_ep_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_readmsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_readmsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_rma_inject(struct fid_ep *msg_ep, struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_tx_entry *tx_entry; struct rxm_tx_buf *tx_buf; struct fi_msg_rma msg_rma; struct iovec iov; size_t size; ssize_t ret; size = ofi_total_iov_len(msg->msg_iov, msg->iov_count); if (size > rxm_ep->rxm_info->tx_attr->inject_size) return -FI_EMSGSIZE; /* Use fi_inject_write instead of fi_writemsg since the latter generates * completion by default */ if (size <= rxm_ep->msg_info->tx_attr->inject_size && !(flags & FI_COMPLETION)) { if (flags & FI_REMOTE_CQ_DATA) return fi_inject_writedata(msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->data, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); else return fi_inject_write(msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } tx_buf = rxm_tx_buf_get(rxm_ep, RXM_BUF_POOL_TX_MSG); if (!tx_buf) { FI_WARN(&rxm_prov, FI_LOG_CQ, "TX queue full!\n"); rxm_ep_progress_multi(&rxm_ep->util_ep); return -FI_EAGAIN; } tx_entry = rxm_tx_entry_get(&rxm_ep->send_queue); if (!tx_entry) { rxm_ep_progress_multi(&rxm_ep->util_ep); ret = -FI_EAGAIN; goto err1; } tx_entry->state = RXM_TX; tx_entry->flags = flags; tx_entry->comp_flags = FI_RMA | FI_WRITE; tx_entry->tx_buf = tx_buf; ofi_copy_from_iov(tx_buf->pkt.data, size, msg->msg_iov, msg->iov_count, 0); iov.iov_base = &tx_buf->pkt.data; iov.iov_len = size; msg_rma.msg_iov = &iov; msg_rma.desc = &tx_buf->hdr.desc; msg_rma.iov_count = 1; msg_rma.addr = msg->addr; msg_rma.rma_iov = msg->rma_iov; msg_rma.rma_iov_count = msg->rma_iov_count; msg_rma.context = tx_entry; msg_rma.data = msg->data; flags = (flags & ~FI_INJECT) | FI_COMPLETION; ret = fi_writemsg(msg_ep, &msg_rma, flags); if (ret) { if (ret == -FI_EAGAIN) rxm_ep_progress_multi(&rxm_ep->util_ep); goto err2; } return 0; err2: rxm_tx_entry_release(&rxm_ep->send_queue, tx_entry); err1: rxm_tx_buf_release(rxm_ep, tx_buf); return ret; } static ssize_t rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct util_cmap_handle *handle; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep; int ret; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = ofi_cmap_get_handle(rxm_ep->util_ep.cmap, msg->addr, &handle); if (OFI_UNLIKELY(ret)) return ret; rxm_conn = 
container_of(handle, struct rxm_conn, handle); if (flags & FI_INJECT) return rxm_ep_rma_inject(rxm_conn->msg_ep, rxm_ep, msg, flags); else return rxm_ep_rma_common(rxm_conn->msg_ep, rxm_ep, msg, flags, fi_writemsg, FI_WRITE); } static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA); } static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, (rxm_ep_tx_flags(rxm_ep) & ~FI_COMPLETION) | FI_INJECT); } static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, (rxm_ep_tx_flags(rxm_ep) & ~FI_COMPLETION) | FI_INJECT | FI_REMOTE_CQ_DATA); } struct fi_ops_rma rxm_ops_rma = { .size = sizeof (struct fi_ops_rma), .read = rxm_ep_read, .readv = rxm_ep_readv, .readmsg = rxm_ep_readmsg, .write = rxm_ep_write, .writev = rxm_ep_writev, .writemsg = 
rxm_ep_writemsg, .inject = rxm_ep_inject_write, .writedata = rxm_ep_writedata, .injectdata = rxm_ep_inject_writedata, };
int _gnix_amo_post_req(void *data) { struct gnix_fab_req *fab_req = (struct gnix_fab_req *)data; struct gnix_fid_ep *ep = fab_req->gnix_ep; struct gnix_nic *nic = ep->nic; struct gnix_fid_mem_desc *loc_md; struct gnix_tx_descriptor *txd; gni_mem_handle_t mdh; gni_return_t status; int rc; int inject_err = _gnix_req_inject_err(fab_req); if (!gnix_ops_allowed(ep, fab_req->vc->peer_caps, fab_req->flags)) { GNIX_DEBUG(FI_LOG_EP_DATA, "flags:0x%llx, %s\n", fab_req->flags, fi_tostr(&fab_req->flags, FI_TYPE_OP_FLAGS)); GNIX_DEBUG(FI_LOG_EP_DATA, "caps:0x%llx, %s\n", ep->caps, fi_tostr(&ep->caps, FI_TYPE_CAPS)); GNIX_DEBUG(FI_LOG_EP_DATA, "peer_caps:0x%llx, %s\n", fab_req->vc->peer_caps, fi_tostr(&fab_req->vc->peer_caps, FI_TYPE_OP_FLAGS)); rc = __gnix_amo_post_err(fab_req, FI_EOPNOTSUPP); if (rc != FI_SUCCESS) GNIX_WARN(FI_LOG_EP_DATA, "__gnix_amo_post_err() failed: %d\n", rc); return -FI_ECANCELED; } rc = _gnix_nic_tx_alloc(nic, &txd); if (rc) { GNIX_INFO(FI_LOG_EP_DATA, "_gnix_nic_tx_alloc() failed: %d\n", rc); return -FI_ENOSPC; } txd->completer_fn = __gnix_amo_txd_complete; txd->req = fab_req; /* Mem handle CRC is not validated during FMA operations. Skip this * costly calculation. */ _gnix_convert_key_to_mhdl_no_crc( (gnix_mr_key_t *)&fab_req->amo.rem_mr_key, &mdh); loc_md = (struct gnix_fid_mem_desc *)fab_req->amo.loc_md; txd->gni_desc.type = GNI_POST_AMO; txd->gni_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT; /* check flags */ txd->gni_desc.dlvr_mode = GNI_DLVMODE_PERFORMANCE; /* check flags */ txd->gni_desc.local_addr = (uint64_t)fab_req->amo.loc_addr; if (loc_md) { txd->gni_desc.local_mem_hndl = loc_md->mem_hndl; } txd->gni_desc.remote_addr = (uint64_t)fab_req->amo.rem_addr; txd->gni_desc.remote_mem_hndl = mdh; txd->gni_desc.length = fab_req->amo.len; txd->gni_desc.rdma_mode = 0; /* check flags */ txd->gni_desc.src_cq_hndl = nic->tx_cq; /* check flags */ txd->gni_desc.amo_cmd = _gnix_atomic_cmd(fab_req->amo.datatype, fab_req->amo.op, fab_req->type); txd->gni_desc.first_operand = fab_req->amo.first_operand; txd->gni_desc.second_operand = fab_req->amo.second_operand; GNIX_DEBUG(FI_LOG_EP_DATA, "fo:%016lx so:%016lx\n", txd->gni_desc.first_operand, txd->gni_desc.second_operand); GNIX_DEBUG(FI_LOG_EP_DATA, "amo_cmd:%x\n", txd->gni_desc.amo_cmd); GNIX_LOG_DUMP_TXD(txd); COND_ACQUIRE(nic->requires_lock, &nic->lock); if (OFI_UNLIKELY(inject_err)) { _gnix_nic_txd_err_inject(nic, txd); status = GNI_RC_SUCCESS; } else { status = GNI_PostFma(fab_req->vc->gni_ep, &txd->gni_desc); } COND_RELEASE(nic->requires_lock, &nic->lock); if (status != GNI_RC_SUCCESS) { _gnix_nic_tx_free(nic, txd); GNIX_INFO(FI_LOG_EP_DATA, "GNI_Post*() failed: %s\n", gni_err_str[status]); } return gnixu_to_fi_errno(status); }
static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) { int ret; struct rxm_recv_entry *recv_entry = rx_buf->recv_entry; if (OFI_UNLIKELY(done_len < rx_buf->pkt.hdr.size)) { ret = rxm_cq_write_error_trunc(rx_buf, done_len); if (ret) return ret; } else { if (rx_buf->recv_entry->flags & FI_COMPLETION || rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) { ret = rxm_cq_write_recv_comp( rx_buf, rx_buf->recv_entry->context, rx_buf->recv_entry->comp_flags | rxm_cq_get_rx_comp_flags(rx_buf), rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov[0].iov_base); if (ret) return ret; } ofi_ep_rx_cntr_inc(&rx_buf->ep->util_ep); } if (rx_buf->recv_entry->flags & FI_MULTI_RECV) { struct rxm_iov rxm_iov; size_t recv_size = rx_buf->pkt.hdr.size; struct rxm_ep *rxm_ep = rx_buf->ep; rxm_rx_buf_release(rxm_ep, rx_buf); recv_entry->total_len -= recv_size; if (recv_entry->total_len <= rxm_ep->min_multi_recv_size) { FI_DBG(&rxm_prov, FI_LOG_CQ, "Buffer %p has been completely consumed. " "Reporting Multi-Recv completion\n", recv_entry->multi_recv.buf); ret = rxm_cq_write_multi_recv_comp(rxm_ep, recv_entry); if (OFI_UNLIKELY(ret)) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write FI_MULTI_RECV completion\n"); return ret; } /* Since buffer is elapsed, release recv_entry */ rxm_recv_entry_release(recv_entry->recv_queue, recv_entry); return ret; } FI_DBG(&rxm_prov, FI_LOG_CQ, "Repost Multi-Recv entry: %p " "consumed len = %zu, remain len = %zu\n", recv_entry, recv_size, recv_entry->total_len); rxm_iov = recv_entry->rxm_iov; ret = rxm_match_iov(/* prev iovecs */ rxm_iov.iov, rxm_iov.desc, rxm_iov.count, recv_size, /* offset */ recv_entry->total_len, /* match_len */ &recv_entry->rxm_iov); /* match_iov */ if (OFI_UNLIKELY(ret)) return ret; return rxm_process_recv_entry(recv_entry->recv_queue, recv_entry); } else { rxm_rx_buf_release(rx_buf->ep, rx_buf); rxm_recv_entry_release(recv_entry->recv_queue, recv_entry); } return FI_SUCCESS; }
static inline struct efa_conn *efa_av_map_addr_to_conn(struct efa_av *av,
						       fi_addr_t addr)
{
	if (OFI_UNLIKELY(addr == FI_ADDR_UNSPEC))
		return NULL;
	return (struct efa_conn *)(void *)addr;
}
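/* efa_av_tbl_idx_to_conn() and efa_av_map_addr_to_conn() cover the
 * FI_AV_TABLE and FI_AV_MAP layouts respectively: the table form treats the
 * fi_addr_t as an index into conn_table, while the map form treats it as the
 * connection pointer itself.  A caller could dispatch on the AV type roughly
 * as sketched below; passing the type explicitly and the wrapper name are
 * assumptions for illustration, not taken from this section. */
static inline struct efa_conn *efa_av_addr_to_conn(struct efa_av *av,
						   fi_addr_t addr,
						   enum fi_av_type type)
{
	return (type == FI_AV_MAP) ? efa_av_map_addr_to_conn(av, addr) :
				     efa_av_tbl_idx_to_conn(av, addr);
}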
static ssize_t fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr); struct fi_ibv_rdm_rma_start_data start_data = { .ep_rdm = ep, .conn = conn, .context = msg->context, .flags = FI_RMA | FI_READ | GET_TX_COMP_FLAG(ep, flags), .data_len = (uint64_t)msg->msg_iov[0].iov_len, .rbuf = (uintptr_t)msg->rma_iov[0].addr, .lbuf = (uintptr_t)msg->msg_iov[0].iov_base, .rkey = (uint64_t)(uintptr_t)(msg->rma_iov[0].key), .lkey = (uint64_t)(uintptr_t)(msg->desc ? msg->desc[0] : NULL), .op_code = IBV_WR_RDMA_READ }; struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; struct fi_ibv_rdm_buf *rdm_buf = NULL; ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_request *request; if(msg->iov_count != 1 || msg->rma_iov_count != 1) { assert(0); return -FI_EMSGSIZE; } ret = fi_ibv_rdm_ep_rma_preinit((void**)&start_data.lkey, &rdm_buf, msg->msg_iov[0].iov_len, conn, ep); if (ret) { return ret; } request = util_buf_alloc(fi_ibv_rdm_request_pool); if (OFI_UNLIKELY(!request)) return -FI_EAGAIN; FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->state.err = FI_SUCCESS; request->minfo.is_tagged = 0; request->rmabuf = rdm_buf; fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY, &post_ready_data); } static ssize_t fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); struct fi_rma_iov rma_iov = { .addr = addr, .len = 0, .key = key }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, GET_TX_COMP(ep_rdm)); } static ssize_t fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { const struct iovec iov = { .iov_base = buf, .iov_len = len }; return fi_ibv_rdm_ep_rma_readv(ep_fid, &iov, &desc, 1, src_addr, addr, key, context); } static ssize_t fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); struct fi_ibv_rdm_conn *conn = ep->av->addr_to_conn(ep, msg->addr); struct fi_ibv_rdm_request *request = NULL; struct fi_ibv_rdm_buf *rdm_buf = NULL; ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep, .context = msg->context, .flags = FI_RMA | FI_WRITE | (ep->tx_selective_completion ? (flags & FI_COMPLETION) : FI_COMPLETION), .data_len = (uint64_t)msg->msg_iov[0].iov_len, .rbuf = msg->rma_iov[0].addr, .lbuf = (uintptr_t)msg->msg_iov[0].iov_base, .rkey = msg->rma_iov[0].key, .lkey = (uint64_t)(uintptr_t)(msg->desc ? 
msg->desc[0] : NULL), .op_code = IBV_WR_RDMA_WRITE }; if(msg->iov_count != 1 && msg->rma_iov_count != 1) { assert(0); return -FI_EMSGSIZE; } ret = fi_ibv_rdm_ep_rma_preinit((void**)&start_data.lkey, &rdm_buf, msg->msg_iov[0].iov_len, conn, ep); if (ret) { return ret; } request = util_buf_alloc(fi_ibv_rdm_request_pool); if (OFI_UNLIKELY(!request)) return -FI_EAGAIN; /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->state.err = FI_SUCCESS; request->minfo.is_tagged = 0; request->rmabuf = rdm_buf; FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; return fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_POST_READY, &post_ready_data); } static ssize_t fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = 0, .key = key }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } struct fi_ibv_rdm_ep *ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); return fi_ibv_rdm_ep_rma_writemsg(ep_fid, &msg, GET_TX_COMP(ep_rdm)); } static ssize_t fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { const struct iovec iov = { .iov_base = (void *)buf, .iov_len = len }; return fi_ibv_rdm_ep_rma_writev(ep_fid, &iov, &desc, 1, dest_addr, addr, key, context); } static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); struct fi_ibv_rdm_conn *conn = ep_rdm->av->addr_to_conn(ep_rdm, dest_addr); struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep_rdm, .flags = 0, /* inject does not generate completion */ .data_len = (uint64_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint64_t)key, .lkey = 0 }; ssize_t ret; struct fi_ibv_rdm_request *request = util_buf_alloc(fi_ibv_rdm_request_pool); if (OFI_UNLIKELY(!request)) return -FI_EAGAIN; FI_IBV_RDM_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->state.err = FI_SUCCESS; request->minfo.is_tagged = 0; ret = fi_ibv_rdm_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); switch (ret) { case FI_SUCCESS: return ret; case -FI_EAGAIN: break; default: ret = -errno; break; } FI_IBV_RDM_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG); util_buf_release(fi_ibv_rdm_request_pool, request); fi_ibv_rdm_tagged_poll(ep_rdm); return ret; } struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_ibv_rdm_ep_rma_read, .readv = fi_ibv_rdm_ep_rma_readv, .readmsg = fi_ibv_rdm_ep_rma_readmsg, .write = fi_ibv_rdm_ep_rma_write, .writev = fi_ibv_rdm_ep_rma_writev, .writemsg = fi_ibv_rdm_ep_rma_writemsg, .inject = fi_ibv_rdm_ep_rma_inject_write, .writedata = fi_no_rma_writedata, .injectdata = fi_no_rma_injectdata, }; struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma() { 
return &fi_ibv_rdm_ep_rma_ops; }