static inline ssize_t
fi_ibv_rdm_ep_rma_preinit(void **desc, struct fi_ibv_rdm_buf **rdm_buf,
			  size_t len, struct fi_ibv_rdm_conn *conn,
			  struct fi_ibv_rdm_ep *ep)
{
	assert(desc && rdm_buf);

	if (*desc == NULL && len < ep->rndv_threshold) {
		*rdm_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);
		if (*rdm_buf) {
			*desc = (void *)(uintptr_t)conn->rma_mr->lkey;
		} else {
			goto again;
		}
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   RMA_RESOURCES_IS_BUSY(conn, ep) ||
		   conn->postponed_entry) {
		goto again;
	}

	return FI_SUCCESS;

again:
	fi_ibv_rdm_tagged_poll(ep);
	return -FI_EAGAIN;
}
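/*
 * Usage sketch (illustrative only): the read and write paths below
 * open-code these same resource checks; a caller could instead delegate
 * to fi_ibv_rdm_ep_rma_preinit, with desc, len, conn and ep as in those
 * paths (and assuming the fi_ibv_rdm_conn / fi_ibv_rdm_tagged_conn
 * naming is unified):
 *
 *	struct fi_ibv_rdm_buf *rdm_buf = NULL;
 *	ssize_t ret = fi_ibv_rdm_ep_rma_preinit(&desc, &rdm_buf, len,
 *						conn, ep);
 *	if (ret)
 *		return ret;
 *
 * On failure it has already polled progress once and returns -FI_EAGAIN,
 * so the caller can simply propagate the code.
 */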
static ssize_t
fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len,
		       void *desc, fi_addr_t src_addr,
		       uint64_t addr, uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	/* Unregistered buffers are bounced through a pre-registered eager
	 * buffer, which only works below the rendezvous threshold. */
	if (desc == NULL && len >= ep->rndv_threshold)
		goto out_errinput;

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)src_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);
			if (raw_buf) {
				desc = (void *)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}
		if (again)
			goto out_again;
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   RMA_RESOURCES_IS_BUSY(conn, ep)) {
		/* TODO: should a postponed-queue flow be implemented
		 * for RMA? */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request =
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request,
				      FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.ep_rdm = ep,
		.conn = conn,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_READ
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START,
				   &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);
	ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER;
out:
	return ret;

out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;

out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
			  uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_read(ep, msg->msg_iov[0].iov_base,
					      msg->msg_iov[0].iov_len,
					      msg->desc[0], msg->addr,
					      msg->rma_iov[0].addr,
					      msg->rma_iov[0].key,
					      msg->context);
	}

	/* Multi-IOV RMA is not implemented. */
	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov,
			void **desc, size_t count, fi_addr_t src_addr,
			uint64_t addr, uint64_t key, void *context)
{
	/* rma_iov.addr carries the remote buffer address; the peer's
	 * fi_addr_t belongs in msg.addr. */
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = src_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, 0);
}
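/*
 * Note: readmsg/readv above handle only the single-IOV case
 * (iov_count == 1, rma_iov_count == 1). A hypothetical caller-side
 * fallback could issue one read per IOV against a contiguous remote
 * region, e.g.:
 *
 *	uint64_t offset = 0;
 *	for (i = 0; i < count; i++) {
 *		ret = fi_ibv_rdm_ep_rma_read(ep, iov[i].iov_base,
 *					     iov[i].iov_len, desc[i],
 *					     src_addr, addr + offset,
 *					     key, context);
 *		if (ret)
 *			break;
 *		offset += iov[i].iov_len;
 *	}
 *
 * This trades readv's single aggregate completion for one completion
 * per fragment, so it is a sketch rather than a drop-in replacement.
 */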
static ssize_t
fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len,
			void *desc, fi_addr_t dest_addr,
			uint64_t addr, uint64_t key, void *context)
{
	ssize_t ret = FI_SUCCESS;
	struct fi_ibv_rdm_ep *ep =
		container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid);

	if (desc == NULL && len >= ep->rndv_threshold)
		goto out_errinput;

	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)dest_addr;
	void *raw_buf = NULL;

	if (desc == NULL) {
		int again = 1;

		if (!conn->postponed_entry) {
			raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep);
			if (raw_buf) {
				/* Stage the payload in the pre-registered
				 * bounce buffer. */
				memcpy(raw_buf, buf, len);
				desc = (void *)(uintptr_t)conn->rma_mr->lkey;
				again = 0;
			}
		}
		if (again)
			goto out_again;
	} else if (!fi_ibv_rdm_check_connection(conn, ep) ||
		   SEND_RESOURCES_IS_BUSY(conn, ep)) {
		/* TODO: should a postponed-queue flow be implemented
		 * for RMA? */
		goto out_again;
	}

	struct fi_ibv_rdm_tagged_request *request =
		util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request,
				      FI_LOG_DEBUG);

	/* Initial state */
	request->state.eager = FI_IBV_STATE_EAGER_BEGIN;
	request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;
	request->rmabuf = raw_buf;

	struct fi_ibv_rdm_rma_start_data start_data = {
		.conn = conn,
		.ep_rdm = ep,
		.context = context,
		.data_len = (uint32_t)len,
		.rbuf = addr,
		.lbuf = (uintptr_t)buf,
		.rkey = (uint32_t)key,
		.lkey = (uint32_t)(uintptr_t)desc,
		.op_code = IBV_WR_RDMA_WRITE
	};

	fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START,
				   &start_data);

	struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep };

	ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY,
					 &post_ready_data);
	ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER;
out:
	return ret;

out_again:
	fi_ibv_rdm_tagged_poll(ep);
	ret = -FI_EAGAIN;
	goto out;

out_errinput:
	ret = -FI_EINVAL;
	goto out;
}

static ssize_t
fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg,
			   uint64_t flags)
{
	if (msg->iov_count == 1 && msg->rma_iov_count == 1) {
		return fi_ibv_rdm_ep_rma_write(ep, msg->msg_iov[0].iov_base,
					       msg->msg_iov[0].iov_len,
					       msg->desc[0], msg->addr,
					       msg->rma_iov[0].addr,
					       msg->rma_iov[0].key,
					       msg->context);
	}

	/* Multi-IOV RMA is not implemented. */
	assert(0);
	return -FI_EMSGSIZE;
}

static ssize_t
fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep, const struct iovec *iov,
			 void **desc, size_t count, fi_addr_t dest_addr,
			 uint64_t addr, uint64_t key, void *context)
{
	/* rma_iov.addr carries the remote buffer address; the peer's
	 * fi_addr_t belongs in msg.addr. */
	struct fi_rma_iov rma_iov = {
		.addr = addr,
		.len = 0,
		.key = key
	};
	size_t i;

	for (i = 0; i < count; i++)
		rma_iov.len += iov[i].iov_len;

	struct fi_msg_rma msg = {
		.msg_iov = iov,
		.desc = desc,
		.iov_count = count,
		.addr = dest_addr,
		.rma_iov = &rma_iov,
		.rma_iov_count = 1,
		.context = context,
		.data = 0
	};

	return fi_ibv_rdm_ep_rma_writemsg(ep, &msg, 0);
}

static ssize_t
fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len,
			       fi_addr_t dest_addr, uint64_t addr,
			       uint64_t key)
{
	struct fi_ibv_rdm_ep *ep_rdm =
		container_of(ep, struct fi_ibv_rdm_ep, ep_fid);
	struct fi_ibv_rdm_tagged_conn *conn =
		(struct fi_ibv_rdm_tagged_conn *)dest_addr;
	struct fi_ibv_rdm_tagged_request *request = NULL;
	int ret = FI_EP_RDM_HNDL_AGAIN;

	/* Inject is eager-only: the payload must fit below the
	 * rendezvous threshold. */
	if (len >= ep_rdm->rndv_threshold)
		return -FI_EMSGSIZE;

	if (fi_ibv_rdm_check_connection(conn, ep_rdm) &&
	    !RMA_RESOURCES_IS_BUSY(conn, ep_rdm) &&
	    !conn->postponed_entry) {
		request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool);
		FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request,
					      FI_LOG_DEBUG);

		/* Initial state */
		request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT;
		request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED;

		struct fi_ibv_rdm_rma_start_data start_data = {
			.conn = conn,
			.ep_rdm = ep_rdm,
			.data_len = (uint32_t)len,
			.rbuf = addr,
			.lbuf = (uintptr_t)buf,
			.rkey = (uint32_t)key,
			.lkey = 0
		};

		ret = fi_ibv_rdm_tagged_req_hndl(request,
						 FI_IBV_EVENT_RMA_START,
						 &start_data);
	}

	switch (ret) {
	case FI_EP_RDM_HNDL_SUCCESS:
		return ret;
	case FI_EP_RDM_HNDL_AGAIN:
		ret = -FI_EAGAIN;
		break;
	default:
		ret = -errno;
		break;
	}

	if (request) {
		FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request,
					      FI_LOG_DEBUG);
		util_buf_release(fi_ibv_rdm_tagged_request_pool, request);
	}

	fi_ibv_rdm_tagged_poll(ep_rdm);
	return ret;
}
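/*
 * Application-side sketch (assumed usage, not part of this provider):
 * since inject is eager-only here, a caller whose payload may exceed
 * the rendezvous threshold could fall back to a regular write via the
 * public libfabric API:
 *
 *	ssize_t ret = fi_inject_write(ep, buf, len, dest_addr, addr, key);
 *	if (ret == -FI_EMSGSIZE)
 *		ret = fi_write(ep, buf, len, desc, dest_addr, addr, key,
 *			       context);
 *
 * where desc and context are the caller's registered-memory descriptor
 * and completion context.
 */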
static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = {
	.size = sizeof(struct fi_ops_rma),
	.read = fi_ibv_rdm_ep_rma_read,
	.readv = fi_ibv_rdm_ep_rma_readv,
	.readmsg = fi_ibv_rdm_ep_rma_readmsg,
	.write = fi_ibv_rdm_ep_rma_write,
	.writev = fi_ibv_rdm_ep_rma_writev,
	.writemsg = fi_ibv_rdm_ep_rma_writemsg,
	.inject = fi_ibv_rdm_ep_rma_inject_write,
	.writedata = fi_no_rma_writedata,
	.injectdata = fi_no_rma_injectdata,
};

struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep)
{
	return &fi_ibv_rdm_ep_rma_ops;
}
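/*
 * Wiring sketch (assumed endpoint-initialization usage): the getter
 * lets endpoint setup install these operations on the public fid, e.g.
 *
 *	ep->ep_fid.rma = fi_ibv_rdm_ep_ops_rma(ep);
 *
 * for a struct fi_ibv_rdm_ep *ep, matching the container_of() usage in
 * the calls above.
 */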