struct rxd_x_entry *rxd_rx_entry_init(struct rxd_ep *ep, const struct iovec *iov, size_t iov_count, uint64_t tag, uint64_t ignore, void *context, fi_addr_t addr, uint32_t op, uint32_t flags) { struct rxd_x_entry *rx_entry; rx_entry = rxd_get_rx_entry(ep, op); if (!rx_entry) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not get rx entry\n"); return NULL; } rx_entry->peer = addr; rx_entry->flags = flags; rx_entry->bytes_done = 0; rx_entry->offset = 0; rx_entry->next_seg_no = 0; rx_entry->iov_count = iov_count; rx_entry->op = op; rx_entry->ignore = ignore; memcpy(rx_entry->iov, iov, sizeof(*rx_entry->iov) * iov_count); rx_entry->cq_entry.op_context = context; rx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count); rx_entry->cq_entry.buf = iov[0].iov_base; rx_entry->cq_entry.tag = tag; rx_entry->cq_entry.flags = ofi_rx_cq_flags(op); dlist_init(&rx_entry->entry); return rx_entry; }
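/* Hedged sketch: nearly every snippet in this section sizes its transfer with
 * ofi_total_iov_len(iov, count). The helper's real implementation lives in the
 * OFI utility code; the mirror below only shows the behavior the call sites
 * imply (sum of iov_len over the first count entries) and is not the library's
 * source. */
#include <stddef.h>
#include <sys/uio.h>

static size_t total_iov_len_sketch(const struct iovec *iov, size_t count)
{
	size_t i, len = 0;

	for (i = 0; i < count; i++)
		len += iov[i].iov_len;	/* bytes covered by this segment */
	return len;
}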
/* Get a match_iov derived from iov whose size matches given length */ static int rxm_match_iov(const struct iovec *iov, void **desc, uint8_t count, uint64_t offset, size_t match_len, struct rxm_iov *match_iov) { uint8_t i; assert(count <= RXM_IOV_LIMIT); for (i = 0; i < count; i++) { if (offset >= iov[i].iov_len) { offset -= iov[i].iov_len; continue; } match_iov->iov[i].iov_base = (char *)iov[i].iov_base + offset; match_iov->iov[i].iov_len = MIN(iov[i].iov_len - offset, match_len); if (desc) match_iov->desc[i] = desc[i]; match_len -= match_iov->iov[i].iov_len; if (!match_len) break; offset = 0; } if (match_len) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Given iov size (%zu) < match_len (remaining match_len = %zu)!\n", ofi_total_iov_len(iov, count), match_len); return -FI_ETOOSMALL; } match_iov->count = i + 1; return FI_SUCCESS; }
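/* Hedged usage sketch for the trimming logic above: with two 4 KiB source
 * segments, an offset of 1024 and a match_len of 2048, the derived view is a
 * single segment starting 1024 bytes into the first buffer. struct demo_iov is
 * a stand-in for struct rxm_iov (only the fields the loop touches), and the
 * walk below restates the same skip-then-clip logic outside the provider so it
 * can run standalone. */
#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

#define DEMO_IOV_LIMIT 4	/* stand-in for RXM_IOV_LIMIT */

struct demo_iov {
	struct iovec iov[DEMO_IOV_LIMIT];
	size_t count;
};

static void demo_match_iov(void)
{
	static char a[4096], b[4096];
	struct iovec src[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	struct demo_iov out = { .count = 0 };
	size_t offset = 1024, match_len = 2048, i;

	for (i = 0; i < 2; i++) {
		if (offset >= src[i].iov_len) {	/* skip segments fully covered by offset */
			offset -= src[i].iov_len;
			continue;
		}
		out.iov[i].iov_base = (char *) src[i].iov_base + offset;
		out.iov[i].iov_len = src[i].iov_len - offset < match_len ?
				     src[i].iov_len - offset : match_len;
		match_len -= out.iov[i].iov_len;
		if (!match_len)
			break;
		offset = 0;
	}
	out.count = i + 1;

	assert(out.count == 1);
	assert(out.iov[0].iov_base == a + 1024);
	assert(out.iov[0].iov_len == 2048);
}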
static ssize_t mrail_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { return mrail_send_common(ep_fid, msg->msg_iov, msg->desc, msg->iov_count, ofi_total_iov_len(msg->msg_iov, msg->iov_count), msg->addr, msg->data, msg->context, flags | mrail_comp_flag(ep_fid)); }
// TODO go for separate recv functions (recvmsg, recvv, etc) to be optimal static ssize_t mrail_recv_common(struct mrail_ep *mrail_ep, struct mrail_recv_queue *recv_queue, struct iovec *iov, size_t count, void *context, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, uint64_t flags, uint64_t comp_flags) { struct mrail_recv *recv; struct mrail_unexp_msg_entry *unexp_msg_entry; recv = mrail_pop_recv(mrail_ep); if (!recv) return -FI_EAGAIN; recv->count = count + 1; recv->context = context; recv->flags = flags; recv->comp_flags |= comp_flags; recv->addr = src_addr; recv->tag = tag; recv->ignore = ignore; memcpy(&recv->iov[1], iov, sizeof(*iov) * count); FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting recv of length: %zu " "src_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 " ignore: 0x%" PRIx64 "\n", ofi_total_iov_len(iov, count), recv->addr, recv->tag, recv->ignore); ofi_ep_lock_acquire(&mrail_ep->util_ep); unexp_msg_entry = container_of(dlist_remove_first_match( &recv_queue->unexp_msg_list, recv_queue->match_unexp, recv), struct mrail_unexp_msg_entry, entry); if (!unexp_msg_entry) { dlist_insert_tail(&recv->entry, &recv_queue->recv_list); ofi_ep_lock_release(&mrail_ep->util_ep); return 0; } ofi_ep_lock_release(&mrail_ep->util_ep); FI_DBG(recv_queue->prov, FI_LOG_EP_DATA, "Match for posted recv" " with addr: 0x%" PRIx64 ", tag: 0x%" PRIx64 " ignore: " "0x%" PRIx64 " found in unexpected msg queue\n", recv->addr, recv->tag, recv->ignore); return mrail_cq_process_buf_recv((struct fi_cq_tagged_entry *) unexp_msg_entry->data, recv); }
ssize_t rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct rxd_ep *ep; struct fi_rma_iov rma_iov; ep = container_of(ep_fid, struct rxd_ep, util_ep.ep_fid.fid); rma_iov.addr = addr; rma_iov.len = ofi_total_iov_len(iov, count); rma_iov.key = key; return rxd_generic_rma(ep, iov, count, &rma_iov, 1, desc, dest_addr, context, ofi_op_write, 0, rxd_ep_tx_flags(ep)); }
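/* Hedged caller-side sketch for the writev path above: rxd_writev derives the
 * remote length from the local iov via ofi_total_iov_len, so the remote
 * registration at raddr/rkey has to cover the whole local span. ep, dest,
 * raddr, rkey and ctx are assumed to have been set up elsewhere. */
#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>

static ssize_t demo_writev(struct fid_ep *ep, fi_addr_t dest, uint64_t raddr,
			   uint64_t rkey, void *ctx)
{
	static char hdr[64], payload[4096];
	struct iovec iov[2] = {
		{ .iov_base = hdr, .iov_len = sizeof(hdr) },
		{ .iov_base = payload, .iov_len = sizeof(payload) },
	};

	/* Remote window must span at least sizeof(hdr) + sizeof(payload) bytes. */
	return fi_writev(ep, iov, NULL, 2, dest, raddr, rkey, ctx);
}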
static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep, const struct iovec *iov, size_t iov_count, const struct fi_rma_iov *rma_iov, size_t rma_count, fi_addr_t addr, void *context, uint32_t op, uint64_t data, uint32_t rxd_flags) { struct rxd_x_entry *tx_entry; fi_addr_t rxd_addr; ssize_t ret = -FI_EAGAIN; assert(iov_count <= RXD_IOV_LIMIT && rma_count <= RXD_IOV_LIMIT); assert(ofi_total_iov_len(iov, iov_count) <= rxd_ep_domain(rxd_ep)->max_inline_rma); fastlock_acquire(&rxd_ep->util_ep.lock); fastlock_acquire(&rxd_ep->util_ep.tx_cq->cq_lock); if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; tx_entry = rxd_tx_entry_init(rxd_ep, iov, iov_count, NULL, 0, rma_count, data, 0, context, rxd_addr, op, rxd_flags); if (!tx_entry) goto out; ret = rxd_ep_send_op(rxd_ep, tx_entry, rma_iov, rma_count, NULL, 0, 0, 0); if (ret) { rxd_tx_entry_free(rxd_ep, tx_entry); goto out; } if (tx_entry->op == RXD_READ_REQ) goto out; ret = 0; out: fastlock_release(&rxd_ep->util_ep.tx_cq->cq_lock); fastlock_release(&rxd_ep->util_ep.lock); return ret; }
static void smr_format_inject_atomic(struct smr_cmd *cmd, fi_addr_t peer_id, const struct iovec *iov, size_t count, const struct iovec *resultv, size_t result_count, const struct iovec *compv, size_t comp_count, uint32_t op, enum fi_datatype datatype, enum fi_op atomic_op, struct smr_region *smr, struct smr_inject_buf *tx_buf) { size_t comp_size; smr_generic_format(cmd, peer_id, op, 0, datatype, atomic_op, 0, 0); cmd->msg.hdr.op_src = smr_src_inject; cmd->msg.hdr.src_data = (char *) tx_buf - (char *) smr; switch (op) { case ofi_op_atomic: case ofi_op_atomic_fetch: if (atomic_op == FI_ATOMIC_READ) cmd->msg.hdr.size = ofi_total_iov_len(resultv, result_count); else cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->data, SMR_INJECT_SIZE, iov, count, 0); break; case ofi_op_atomic_compare: cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->buf, SMR_COMP_INJECT_SIZE, iov, count, 0); comp_size = ofi_copy_from_iov(tx_buf->comp, SMR_COMP_INJECT_SIZE, compv, comp_count, 0); if (comp_size != cmd->msg.hdr.size) FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "atomic and compare buffer size mismatch\n"); break; default: break; } }
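/* Hedged sketch of the gather-copy behavior the call sites above rely on:
 * ofi_copy_from_iov(dst, dst_size, iov, count, offset) appears to copy at most
 * dst_size bytes out of the iov starting offset bytes in, returning the byte
 * count actually copied. This mirror is illustrative, not the utility
 * library's implementation. */
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

static size_t copy_from_iov_sketch(void *dst, size_t dst_size,
				   const struct iovec *iov, size_t count,
				   size_t offset)
{
	size_t i, chunk, done = 0;

	for (i = 0; i < count && done < dst_size; i++) {
		if (offset >= iov[i].iov_len) {	/* segment fully skipped */
			offset -= iov[i].iov_len;
			continue;
		}
		chunk = iov[i].iov_len - offset;
		if (chunk > dst_size - done)
			chunk = dst_size - done;
		memcpy((char *) dst + done,
		       (char *) iov[i].iov_base + offset, chunk);
		done += chunk;
		offset = 0;
	}
	return done;
}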
static ssize_t rxm_ep_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); } static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); } static inline void rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig_msg, struct iovec *rxm_iov, struct fi_msg_rma *rxm_msg) { rxm_msg->context = rma_buf; rxm_msg->addr = orig_msg->addr; rxm_msg->data = orig_msg->data; ofi_copy_from_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size, orig_msg->msg_iov, orig_msg->iov_count, 0); rxm_iov->iov_base = &rma_buf->pkt.data; rxm_iov->iov_len = rma_buf->pkt.hdr.size; rxm_msg->msg_iov = rxm_iov; rxm_msg->desc = &rma_buf->hdr.desc; rxm_msg->iov_count = 1; rxm_msg->rma_iov = orig_msg->rma_iov; rxm_msg->rma_iov_count = orig_msg->rma_iov_count; } static inline ssize_t rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, size_t total_size, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_rma_buf *rma_buf; ssize_t ret; struct iovec rxm_msg_iov = { 0 }; struct fi_msg_rma rxm_rma_msg = { 0 }; assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit); ofi_ep_lock_acquire(&rxm_ep->util_ep); rma_buf = rxm_rma_buf_alloc(rxm_ep); if (OFI_UNLIKELY(!rma_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from RMA buffer pool\n"); ret = -FI_ENOMEM; goto unlock; } rma_buf->pkt.hdr.size = total_size; rma_buf->app_context = msg->context; rma_buf->flags = flags; rxm_ep_format_rma_msg(rma_buf, msg, &rxm_msg_iov, &rxm_rma_msg); flags = (flags & ~FI_INJECT) | FI_COMPLETION; ret = fi_writemsg(rxm_conn->msg_ep, &rxm_rma_msg, flags); if (OFI_UNLIKELY(ret)) { if (ret == -FI_EAGAIN) rxm_ep_do_progress(&rxm_ep->util_ep); ofi_buf_free(rma_buf); } unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; } static inline ssize_t rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, uint64_t flags) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = data, }; return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, len, &msg, flags); } static inline ssize_t rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t 
flags) { struct rxm_conn *rxm_conn; size_t total_size = ofi_total_iov_len(msg->msg_iov, msg->iov_count); ssize_t ret; assert(total_size <= rxm_ep->rxm_info->tx_attr->inject_size); ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if ((total_size <= rxm_ep->msg_info->tx_attr->inject_size) && !(flags & FI_COMPLETION) && (msg->iov_count == 1) && (msg->rma_iov_count == 1)) { if (flags & FI_REMOTE_CQ_DATA) { ret = fi_inject_writedata(rxm_conn->msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->data, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } else { ret = fi_inject_write(rxm_conn->msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write* for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, total_size, msg, flags); } } static inline ssize_t rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); if (flags & FI_INJECT) return rxm_ep_rma_inject_common(rxm_ep, msg, flags); else return rxm_ep_rma_common(rxm_ep, msg, flags, fi_writemsg, FI_WRITE); } static inline ssize_t rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, msg, flags | rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA); } static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return 
rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if (len <= rxm_ep->msg_info->tx_attr->inject_size) { ret = fi_inject_write(rxm_conn->msg_ep, buf, len, dest_addr, addr, key); if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, 0, dest_addr, addr, key, FI_INJECT); } } static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) return ret; if (len <= rxm_ep->msg_info->tx_attr->inject_size) { ret = fi_inject_writedata(rxm_conn->msg_ep, buf, len, data, dest_addr, addr, key); if (OFI_LIKELY(!ret)) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); } else { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_writedata for MSG provider failed with ret - %" PRId64"\n", ret); if (OFI_LIKELY(ret == -FI_EAGAIN)) rxm_ep_progress(&rxm_ep->util_ep); } return ret; } else { return rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, data, dest_addr, addr, key, FI_REMOTE_CQ_DATA | FI_INJECT); } } struct fi_ops_rma rxm_ops_rma = { .size = sizeof (struct fi_ops_rma), .read = rxm_ep_read, .readv = rxm_ep_readv, .readmsg = rxm_ep_readmsg, .write = rxm_ep_write, .writev = rxm_ep_writev, .writemsg = rxm_ep_writemsg, .inject = rxm_ep_inject_write, .writedata = rxm_ep_writedata, .injectdata = rxm_ep_inject_writedata, };
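/* Hedged caller-side sketch of the inject contract the rxm functions above
 * implement: fi_inject_write() completes locally on return and generates no
 * completion entry, so callers typically bound the payload by
 * tx_attr->inject_size and fall back to a completion-based write otherwise.
 * ep, info, dest, raddr, rkey and ctx are assumed to exist already. */
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_rma.h>

static ssize_t demo_put_small(struct fid_ep *ep, struct fi_info *info,
			      const void *buf, size_t len, fi_addr_t dest,
			      uint64_t raddr, uint64_t rkey, void *ctx)
{
	if (len <= info->tx_attr->inject_size)
		return fi_inject_write(ep, buf, len, dest, raddr, rkey);

	/* Larger payloads need the ordinary, completion-generating path. */
	return fi_write(ep, buf, len, NULL, dest, raddr, rkey, ctx);
}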
struct rxd_x_entry *rxd_tx_entry_init(struct rxd_ep *ep, const struct iovec *iov, size_t iov_count, const struct iovec *res_iov, size_t res_count, size_t rma_count, uint64_t data, uint64_t tag, void *context, fi_addr_t addr, uint32_t op, uint32_t flags) { struct rxd_x_entry *tx_entry; struct rxd_domain *rxd_domain = rxd_ep_domain(ep); size_t max_inline; tx_entry = rxd_get_tx_entry(ep, op); if (!tx_entry) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not get tx entry\n"); return NULL; } tx_entry->op = op; tx_entry->peer = addr; tx_entry->flags = flags; tx_entry->bytes_done = 0; tx_entry->offset = 0; tx_entry->next_seg_no = 0; tx_entry->iov_count = iov_count; memcpy(&tx_entry->iov[0], iov, sizeof(*iov) * iov_count); if (res_count) { tx_entry->res_count = res_count; memcpy(&tx_entry->res_iov[0], res_iov, sizeof(*res_iov) * res_count); } tx_entry->cq_entry.op_context = context; tx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count); tx_entry->cq_entry.buf = iov[0].iov_base; tx_entry->cq_entry.flags = ofi_tx_cq_flags(op); tx_entry->cq_entry.tag = tag; tx_entry->pkt = NULL; max_inline = rxd_domain->max_inline_msg; if (tx_entry->cq_entry.flags & FI_RMA) max_inline -= sizeof(struct ofi_rma_iov) * rma_count; if (tx_entry->flags & RXD_TAG_HDR) max_inline -= sizeof(tx_entry->cq_entry.tag); if (tx_entry->flags & RXD_REMOTE_CQ_DATA) { max_inline -= sizeof(tx_entry->cq_entry.data); tx_entry->cq_entry.data = data; } if (rma_count > 1 || tx_entry->cq_entry.flags & FI_READ || tx_entry->cq_entry.len > max_inline) max_inline -= sizeof(struct rxd_sar_hdr); else tx_entry->flags |= RXD_INLINE; if (tx_entry->cq_entry.flags & FI_ATOMIC || tx_entry->cq_entry.len <= max_inline) tx_entry->num_segs = 1; else if (tx_entry->cq_entry.flags & FI_READ) tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len, rxd_domain->max_seg_sz); else tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len - max_inline, rxd_domain->max_seg_sz) + 1; if ((tx_entry->op == RXD_READ_REQ || tx_entry->op == RXD_ATOMIC_FETCH || tx_entry->op == RXD_ATOMIC_COMPARE) && ep->peers[tx_entry->peer].unacked_cnt < ep->peers[tx_entry->peer].tx_window && ep->peers[tx_entry->peer].peer_addr != FI_ADDR_UNSPEC) dlist_insert_tail(&tx_entry->entry, &ep->peers[tx_entry->peer].rma_rx_list); else dlist_insert_tail(&tx_entry->entry, &ep->peers[tx_entry->peer].tx_list); return tx_entry; }
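/* Hedged arithmetic check of the segmentation above: for a non-read,
 * non-atomic transfer the first packet carries up to max_inline bytes and each
 * later packet up to max_seg_sz, so when len > max_inline the snippet computes
 * num_segs = ofi_div_ceil(len - max_inline, max_seg_sz) + 1. The numbers below
 * are made up purely for illustration. */
#include <assert.h>
#include <stddef.h>

static size_t div_ceil_sketch(size_t a, size_t b) { return (a + b - 1) / b; }

static void demo_rxd_segments(void)
{
	size_t max_inline = 200, max_seg_sz = 1024, len = 5000;
	size_t num_segs;

	if (len <= max_inline)
		num_segs = 1;
	else
		num_segs = div_ceil_sketch(len - max_inline, max_seg_sz) + 1;

	/* 5000 - 200 = 4800 trailing bytes -> ceil(4800/1024) = 5 segments,
	 * plus the inline-carrying first segment = 6. */
	assert(num_segs == 6);
}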
static ssize_t rxm_ep_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_readmsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = src_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_readmsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_rma_inject(struct fid_ep *msg_ep, struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags) { struct rxm_tx_entry *tx_entry; struct rxm_tx_buf *tx_buf; struct fi_msg_rma msg_rma; struct iovec iov; size_t size; ssize_t ret; size = ofi_total_iov_len(msg->msg_iov, msg->iov_count); if (size > rxm_ep->rxm_info->tx_attr->inject_size) return -FI_EMSGSIZE; /* Use fi_inject_write instead of fi_writemsg since the latter generates * completion by default */ if (size <= rxm_ep->msg_info->tx_attr->inject_size && !(flags & FI_COMPLETION)) { if (flags & FI_REMOTE_CQ_DATA) return fi_inject_writedata(msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->data, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); else return fi_inject_write(msg_ep, msg->msg_iov->iov_base, msg->msg_iov->iov_len, msg->addr, msg->rma_iov->addr, msg->rma_iov->key); } tx_buf = rxm_tx_buf_get(rxm_ep, RXM_BUF_POOL_TX_MSG); if (!tx_buf) { FI_WARN(&rxm_prov, FI_LOG_CQ, "TX queue full!\n"); rxm_ep_progress_multi(&rxm_ep->util_ep); return -FI_EAGAIN; } tx_entry = rxm_tx_entry_get(&rxm_ep->send_queue); if (!tx_entry) { rxm_ep_progress_multi(&rxm_ep->util_ep); ret = -FI_EAGAIN; goto err1; } tx_entry->state = RXM_TX; tx_entry->flags = flags; tx_entry->comp_flags = FI_RMA | FI_WRITE; tx_entry->tx_buf = tx_buf; ofi_copy_from_iov(tx_buf->pkt.data, size, msg->msg_iov, msg->iov_count, 0); iov.iov_base = &tx_buf->pkt.data; iov.iov_len = size; msg_rma.msg_iov = &iov; msg_rma.desc = &tx_buf->hdr.desc; msg_rma.iov_count = 1; msg_rma.addr = msg->addr; msg_rma.rma_iov = msg->rma_iov; msg_rma.rma_iov_count = msg->rma_iov_count; msg_rma.context = tx_entry; msg_rma.data = msg->data; flags = (flags & ~FI_INJECT) | FI_COMPLETION; ret = fi_writemsg(msg_ep, &msg_rma, flags); if (ret) { if (ret == -FI_EAGAIN) rxm_ep_progress_multi(&rxm_ep->util_ep); goto err2; } return 0; err2: rxm_tx_entry_release(&rxm_ep->send_queue, tx_entry); err1: rxm_tx_buf_release(rxm_ep, tx_buf); return ret; } static ssize_t rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct util_cmap_handle *handle; struct rxm_conn *rxm_conn; struct rxm_ep *rxm_ep; int ret; rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ret = ofi_cmap_get_handle(rxm_ep->util_ep.cmap, msg->addr, &handle); if (OFI_UNLIKELY(ret)) return ret; rxm_conn = 
container_of(handle, struct rxm_conn, handle); if (flags & FI_INJECT) return rxm_ep_rma_inject(rxm_conn->msg_ep, rxm_ep, msg, flags); else return rxm_ep_rma_common(rxm_conn->msg_ep, rxm_ep, msg, flags, fi_writemsg, FI_WRITE); } static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = ofi_total_iov_len(iov, count), .key = key, }; struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA); } static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = &desc, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); } static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = 0, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, (rxm_ep_tx_flags(rxm_ep) & ~FI_COMPLETION) | FI_INJECT); } static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_rma_iov rma_iov = { .addr = addr, .len = len, .key = key, }; struct iovec iov = { .iov_base = (void*)buf, .iov_len = len, }; struct fi_msg_rma msg = { .msg_iov = &iov, .desc = NULL, .iov_count = 1, .addr = dest_addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = NULL, .data = data, }; struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_writemsg(ep_fid, &msg, (rxm_ep_tx_flags(rxm_ep) & ~FI_COMPLETION) | FI_INJECT | FI_REMOTE_CQ_DATA); } struct fi_ops_rma rxm_ops_rma = { .size = sizeof (struct fi_ops_rma), .read = rxm_ep_read, .readv = rxm_ep_readv, .readmsg = rxm_ep_readmsg, .write = rxm_ep_write, .writev = rxm_ep_writev, .writemsg = 
rxm_ep_writemsg, .inject = rxm_ep_inject_write, .writedata = rxm_ep_writedata, .injectdata = rxm_ep_inject_writedata, };
static ssize_t mrail_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_send_common(ep_fid, &iov, &desc, 1, len, dest_addr, 0, context, mrail_comp_flag(ep_fid)); } static ssize_t mrail_inject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_send_common(ep_fid, &iov, NULL, 1, len, dest_addr, 0, NULL, mrail_inject_flags(ep_fid)); } static ssize_t mrail_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_send_common(ep_fid, &iov, NULL, 1, len, dest_addr, data, NULL, (mrail_inject_flags(ep_fid) | FI_REMOTE_CQ_DATA)); } static ssize_t mrail_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, uint64_t flags) { return mrail_tsend_common(ep_fid, msg->msg_iov, msg->desc, msg->iov_count, ofi_total_iov_len(msg->msg_iov, msg->iov_count), msg->addr, msg->tag, msg->data, msg->context, flags | mrail_comp_flag(ep_fid)); } static ssize_t mrail_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_tsend_common(ep_fid, &iov, &desc, 1, len, dest_addr, tag, 0, context, mrail_comp_flag(ep_fid)); } static ssize_t mrail_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_tsend_common(ep_fid, &iov, &desc, 1, len, dest_addr, tag, data, context, (mrail_comp_flag(ep_fid) | FI_REMOTE_CQ_DATA)); } static ssize_t mrail_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_tsend_common(ep_fid, &iov, NULL, 1, len, dest_addr, tag, 0, NULL, mrail_inject_flags(ep_fid)); } static ssize_t mrail_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; return mrail_tsend_common(ep_fid, &iov, NULL, 1, len, dest_addr, tag, data, NULL, (mrail_inject_flags(ep_fid) | FI_REMOTE_CQ_DATA)); } static struct mrail_unexp_msg_entry * mrail_get_unexp_msg_entry(struct mrail_recv_queue *recv_queue, void *context) { // TODO use buf pool // context would be mrail_ep from which u can get the buf pool struct mrail_unexp_msg_entry *unexp_msg_entry = malloc(sizeof(*unexp_msg_entry) + sizeof(struct fi_cq_tagged_entry)); return unexp_msg_entry; } static int mrail_getname(fid_t fid, void *addr, size_t *addrlen) { struct mrail_ep *mrail_ep = container_of(fid, struct mrail_ep, util_ep.ep_fid.fid); struct mrail_domain *mrail_domain = container_of(mrail_ep->util_ep.domain, struct mrail_domain, util_domain); size_t i, offset = 0, rail_addrlen; int ret; if (*addrlen < mrail_domain->addrlen) return -FI_ETOOSMALL; for (i = 0; i < mrail_ep->num_eps; i++) { rail_addrlen = *addrlen - offset; ret = fi_getname(&mrail_ep->rails[i].ep->fid, (char *)addr + offset, &rail_addrlen); if (ret) { FI_WARN(&mrail_prov, FI_LOG_EP_CTRL, "Unable to get name for rail: %zd\n", i); return ret; } offset += rail_addrlen; } return 0; } static void 
mrail_tx_buf_init(void *pool_ctx, void *buf) { struct mrail_ep *mrail_ep = pool_ctx; struct mrail_tx_buf *tx_buf = buf; tx_buf->ep = mrail_ep; tx_buf->hdr.version = MRAIL_HDR_VERSION; } static void mrail_ep_free_bufs(struct mrail_ep *mrail_ep) { if (mrail_ep->req_pool) util_buf_pool_destroy(mrail_ep->req_pool); if (mrail_ep->ooo_recv_pool) util_buf_pool_destroy(mrail_ep->ooo_recv_pool); if (mrail_ep->tx_buf_pool) util_buf_pool_destroy(mrail_ep->tx_buf_pool); if (mrail_ep->recv_fs) mrail_recv_fs_free(mrail_ep->recv_fs); } static int mrail_ep_alloc_bufs(struct mrail_ep *mrail_ep) { struct util_buf_attr attr = { .size = sizeof(struct mrail_tx_buf), .alignment = sizeof(void *), .max_cnt = 0, .chunk_cnt = 64, .alloc_hndlr = NULL, .free_hndlr = NULL, .init = mrail_tx_buf_init, .ctx = mrail_ep, }; size_t buf_size, rxq_total_size = 0; struct fi_info *fi; int ret; for (fi = mrail_ep->info->next; fi; fi = fi->next) rxq_total_size += fi->rx_attr->size; mrail_ep->recv_fs = mrail_recv_fs_create(rxq_total_size, mrail_init_recv, mrail_ep); if (!mrail_ep->recv_fs) return -FI_ENOMEM; ret = util_buf_pool_create(&mrail_ep->ooo_recv_pool, sizeof(struct mrail_ooo_recv), sizeof(void *), 0, 64); if (!mrail_ep->ooo_recv_pool) goto err; ret = util_buf_pool_create_attr(&attr, &mrail_ep->tx_buf_pool); if (!mrail_ep->tx_buf_pool) goto err; buf_size = (sizeof(struct mrail_req) + (mrail_ep->num_eps * sizeof(struct mrail_subreq))); ret = util_buf_pool_create(&mrail_ep->req_pool, buf_size, sizeof(void *), 0, 64); if (ret) goto err; return 0; err: mrail_ep_free_bufs(mrail_ep); return ret; } static int mrail_ep_close(fid_t fid) { struct mrail_ep *mrail_ep = container_of(fid, struct mrail_ep, util_ep.ep_fid.fid); int ret, retv = 0; size_t i; mrail_ep_free_bufs(mrail_ep); for (i = 0; i < mrail_ep->num_eps; i++) { ret = fi_close(&mrail_ep->rails[i].ep->fid); if (ret) retv = ret; } free(mrail_ep->rails); ret = ofi_endpoint_close(&mrail_ep->util_ep); if (ret) retv = ret; free(mrail_ep); return retv; } static int mrail_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) { struct mrail_ep *mrail_ep = container_of(ep_fid, struct mrail_ep, util_ep.ep_fid.fid); struct mrail_cq *mrail_cq; struct mrail_av *mrail_av; struct util_cntr *cntr; int ret = 0; size_t i; switch (bfid->fclass) { case FI_CLASS_AV: mrail_av = container_of(bfid, struct mrail_av, util_av.av_fid.fid); ret = ofi_ep_bind_av(&mrail_ep->util_ep, &mrail_av->util_av); if (ret) return ret; for (i = 0; i < mrail_ep->num_eps; i++) { ret = fi_ep_bind(mrail_ep->rails[i].ep, &mrail_av->avs[i]->fid, flags); if (ret) return ret; } break; case FI_CLASS_CQ: mrail_cq = container_of(bfid, struct mrail_cq, util_cq.cq_fid.fid); ret = ofi_ep_bind_cq(&mrail_ep->util_ep, &mrail_cq->util_cq, flags); if (ret) return ret; for (i = 0; i < mrail_ep->num_eps; i++) { ret = fi_ep_bind(mrail_ep->rails[i].ep, &mrail_cq->cqs[i]->fid, flags); if (ret) return ret; } break; case FI_CLASS_CNTR: cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); ret = ofi_ep_bind_cntr(&mrail_ep->util_ep, cntr, flags); if (ret) return ret; break; case FI_CLASS_EQ: ret = -FI_ENOSYS; break; default: FI_WARN(&mrail_prov, FI_LOG_EP_CTRL, "invalid fid class\n"); ret = -FI_EINVAL; break; } return ret; } static int mrail_ep_ctrl(struct fid *fid, int command, void *arg) { struct mrail_ep *mrail_ep; size_t i, buf_recv_min = sizeof(struct mrail_hdr); int ret; mrail_ep = container_of(fid, struct mrail_ep, util_ep.ep_fid.fid); switch (command) { case FI_ENABLE: if (!mrail_ep->util_ep.rx_cq || 
!mrail_ep->util_ep.tx_cq) return -FI_ENOCQ; if (!mrail_ep->util_ep.av) return -FI_ENOAV; for (i = 0; i < mrail_ep->num_eps; i++) { ret = fi_setopt(&mrail_ep->rails[i].ep->fid, FI_OPT_ENDPOINT, FI_OPT_BUFFERED_MIN, &buf_recv_min, sizeof(buf_recv_min)); if (ret) return ret; ret = fi_enable(mrail_ep->rails[i].ep); if (ret) return ret; } break; default: return -FI_ENOSYS; } return 0; } static struct fi_ops mrail_ep_fi_ops = { .size = sizeof(struct fi_ops), .close = mrail_ep_close, .bind = mrail_ep_bind, .control = mrail_ep_ctrl, .ops_open = fi_no_ops_open, }; static int mrail_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) { struct mrail_ep *mrail_ep; size_t i; int ret = 0; mrail_ep = container_of(fid, struct mrail_ep, util_ep.ep_fid.fid); for (i = 0; i < mrail_ep->num_eps; i++) { ret = fi_setopt(&mrail_ep->rails[i].ep->fid, level, optname, optval, optlen); if (ret) return ret; } return ret; } static struct fi_ops_ep mrail_ops_ep = { .size = sizeof(struct fi_ops_ep), .cancel = fi_no_cancel, .getopt = fi_no_getopt, .setopt = mrail_ep_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, .rx_size_left = fi_no_rx_size_left, .tx_size_left = fi_no_tx_size_left, }; static struct fi_ops_cm mrail_ops_cm = { .size = sizeof(struct fi_ops_cm), .setname = fi_no_setname, .getname = mrail_getname, .getpeer = fi_no_getpeer, .connect = fi_no_connect, .listen = fi_no_listen, .accept = fi_no_accept, .reject = fi_no_reject, .shutdown = fi_no_shutdown, .join = fi_no_join, }; static struct fi_ops_msg mrail_ops_msg = { .size = sizeof(struct fi_ops_msg), .recv = mrail_recv, .recvv = fi_no_msg_recvv, .recvmsg = mrail_recvmsg, .send = mrail_send, .sendv = fi_no_msg_sendv, .sendmsg = mrail_sendmsg, .inject = mrail_inject, .senddata = fi_no_msg_senddata, .injectdata = mrail_injectdata, }; struct fi_ops_tagged mrail_ops_tagged = { .size = sizeof(struct fi_ops_tagged), .recv = mrail_trecv, .recvv = fi_no_tagged_recvv, .recvmsg = mrail_trecvmsg, .send = mrail_tsend, .sendv = fi_no_tagged_sendv, .sendmsg = mrail_tsendmsg, .inject = mrail_tinject, .senddata = mrail_tsenddata, .injectdata = mrail_tinjectdata, }; void mrail_ep_progress(struct util_ep *ep) { struct mrail_ep *mrail_ep; mrail_ep = container_of(ep, struct mrail_ep, util_ep); mrail_progress_deferred_reqs(mrail_ep); } int mrail_ep_open(struct fid_domain *domain_fid, struct fi_info *info, struct fid_ep **ep_fid, void *context) { struct mrail_domain *mrail_domain = container_of(domain_fid, struct mrail_domain, util_domain.domain_fid); struct mrail_ep *mrail_ep; struct fi_info *fi; size_t i; int ret; if (strcmp(mrail_domain->info->domain_attr->name, info->domain_attr->name)) { FI_WARN(&mrail_prov, FI_LOG_EP_CTRL, "info domain name: %s " "doesn't match fid_domain name: %s!\n", info->domain_attr->name, mrail_domain->info->domain_attr->name); return -FI_EINVAL; } mrail_ep = calloc(1, sizeof(*mrail_ep)); if (!mrail_ep) return -FI_ENOMEM; // TODO detect changes b/w mrail_domain->info and info arg // this may be difficult and we may not support such changes mrail_ep->info = mrail_domain->info; mrail_ep->num_eps = mrail_domain->num_domains; ret = ofi_endpoint_init(domain_fid, &mrail_util_prov, info, &mrail_ep->util_ep, context, &mrail_ep_progress); if (ret) { goto free_ep; } mrail_ep->rails = calloc(mrail_ep->num_eps, sizeof(*mrail_ep->rails)); if (!mrail_ep->rails) { ret = -FI_ENOMEM; goto err; } for (i = 0, fi = mrail_ep->info->next; fi; fi = fi->next, i++) { fi->tx_attr->op_flags &= ~FI_COMPLETION; ret = 
fi_endpoint(mrail_domain->domains[i], fi, &mrail_ep->rails[i].ep, mrail_ep); if (ret) { FI_WARN(&mrail_prov, FI_LOG_EP_CTRL, "Unable to open EP\n"); goto err; } mrail_ep->rails[i].info = fi; } ret = mrail_ep_alloc_bufs(mrail_ep); if (ret) goto err; slist_init(&mrail_ep->deferred_reqs); if (mrail_ep->info->caps & FI_DIRECTED_RECV) { mrail_recv_queue_init(&mrail_prov, &mrail_ep->recv_queue, mrail_match_recv_addr, mrail_match_unexp_addr, mrail_get_unexp_msg_entry); mrail_recv_queue_init(&mrail_prov, &mrail_ep->trecv_queue, mrail_match_recv_addr_tag, mrail_match_unexp_addr_tag, mrail_get_unexp_msg_entry); } else { mrail_recv_queue_init(&mrail_prov, &mrail_ep->recv_queue, mrail_match_recv_any, mrail_match_unexp_any, mrail_get_unexp_msg_entry); mrail_recv_queue_init(&mrail_prov, &mrail_ep->trecv_queue, mrail_match_recv_tag, mrail_match_unexp_tag, mrail_get_unexp_msg_entry); } ofi_atomic_initialize32(&mrail_ep->tx_rail, 0); ofi_atomic_initialize32(&mrail_ep->rx_rail, 0); *ep_fid = &mrail_ep->util_ep.ep_fid; (*ep_fid)->fid.ops = &mrail_ep_fi_ops; (*ep_fid)->ops = &mrail_ops_ep; (*ep_fid)->cm = &mrail_ops_cm; (*ep_fid)->msg = &mrail_ops_msg; (*ep_fid)->tagged = &mrail_ops_tagged; (*ep_fid)->rma = &mrail_ops_rma; return 0; err: mrail_ep_close(&mrail_ep->util_ep.ep_fid.fid); free_ep: free(mrail_ep); return ret; }
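/* Hedged bring-up sketch matching the per-rail order the mrail code above
 * relies on (mrail_ep_open, mrail_ep_bind, mrail_ep_ctrl): open the endpoint,
 * bind an AV and a CQ, then enable. domain and info come from earlier
 * fi_getinfo/fi_domain calls; av_fid and cq_fid are the &av->fid / &cq->fid of
 * already-opened objects. */
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>

static int demo_ep_bringup(struct fid_domain *domain, struct fi_info *info,
			   struct fid *av_fid, struct fid *cq_fid,
			   struct fid_ep **ep)
{
	int ret;

	ret = fi_endpoint(domain, info, ep, NULL);
	if (ret)
		return ret;
	ret = fi_ep_bind(*ep, av_fid, 0);
	if (ret)
		return ret;
	ret = fi_ep_bind(*ep, cq_fid, FI_TRANSMIT | FI_RECV);
	if (ret)
		return ret;
	return fi_enable(*ep);	/* mrail_ep_ctrl(FI_ENABLE) does this per rail */
}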
static ssize_t rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_iov_count, struct fi_ioc *resultv, void **result_desc, size_t result_iov_count, uint32_t op, uint64_t flags) { struct rxm_tx_atomic_buf *tx_buf; struct rxm_atomic_hdr *atomic_hdr; struct iovec buf_iov[RXM_IOV_LIMIT]; struct iovec cmp_iov[RXM_IOV_LIMIT]; size_t datatype_sz = ofi_datatype_size(msg->datatype); size_t buf_len = 0; size_t cmp_len = 0; size_t tot_len; ssize_t ret; assert(msg->iov_count <= RXM_IOV_LIMIT && msg->rma_iov_count <= RXM_IOV_LIMIT); if (flags & FI_REMOTE_CQ_DATA) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "atomic with remote CQ data not supported\n"); return -FI_EINVAL; } if (msg->op != FI_ATOMIC_READ) { assert(msg->msg_iov); ofi_ioc_to_iov(msg->msg_iov, buf_iov, msg->iov_count, datatype_sz); buf_len = ofi_total_iov_len(buf_iov, msg->iov_count); } if (op == ofi_op_atomic_compare) { assert(comparev); ofi_ioc_to_iov(comparev, cmp_iov, compare_iov_count, datatype_sz); cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count); assert(buf_len == cmp_len); } tot_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr) + sizeof(struct rxm_pkt); if (tot_len > rxm_eager_limit) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "atomic data too large %zu\n", tot_len); return -FI_EINVAL; } ofi_ep_lock_acquire(&rxm_ep->util_ep); tx_buf = (struct rxm_tx_atomic_buf *) rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); if (OFI_UNLIKELY(!tx_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from Atomic buffer pool\n"); ret = -FI_EAGAIN; goto unlock; } rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, tot_len, op, msg->datatype, msg->op, flags, msg->data, msg->rma_iov, msg->rma_iov_count); tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf); tx_buf->app_context = msg->context; atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data; ofi_copy_from_iov(atomic_hdr->data, buf_len, buf_iov, msg->iov_count, 0); if (cmp_len) ofi_copy_from_iov(atomic_hdr->data + buf_len, cmp_len, cmp_iov, compare_iov_count, 0); tx_buf->result_iov_count = result_iov_count; if (resultv) ofi_ioc_to_iov(resultv, tx_buf->result_iov, result_iov_count, datatype_sz); ret = rxm_ep_send_atomic_req(rxm_ep, rxm_conn, tx_buf, tot_len); if (ret) ofi_buf_free(tx_buf); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; }
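/* Hedged sketch of the ioc-to-iov conversion used above: a struct fi_ioc
 * counts elements rather than bytes, and ofi_ioc_to_iov() is used here as if
 * it scaled each element count by the datatype size. demo_ioc is a local
 * stand-in with the same two documented fields; this mirror only shows the
 * implied behavior, not the utility library's code. */
#include <stddef.h>
#include <sys/uio.h>

struct demo_ioc {
	void	*addr;		/* start of the element array */
	size_t	count;		/* number of elements, not bytes */
};

static void ioc_to_iov_sketch(const struct demo_ioc *ioc, struct iovec *iov,
			      size_t count, size_t datatype_sz)
{
	size_t i;

	for (i = 0; i < count; i++) {
		iov[i].iov_base = ioc[i].addr;
		iov[i].iov_len = ioc[i].count * datatype_sz;	/* elements -> bytes */
	}
}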