static ssize_t fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) src_addr; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep); if (raw_sbuf) { memcpy (raw_sbuf, buf, len); buf = raw_sbuf; desc = (void*)(uintptr_t)conn->s_mr->lkey; again = 0; } } if (again) { goto out_again; } } struct fi_ibv_rdm_tagged_request *request = (struct fi_ibv_rdm_tagged_request *) fi_verbs_mem_pool_get(&fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data data = { .ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid), .conn = (struct fi_ibv_rdm_tagged_conn *) src_addr, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_READ }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep); if (raw_sbuf) { memcpy (raw_sbuf, buf, len); buf = raw_sbuf; desc = (void*)(uintptr_t)conn->s_mr->lkey; again = 0; } } if (again) { goto out_again; } } struct fi_ibv_rdm_tagged_request *request = (struct fi_ibv_rdm_tagged_request *) fi_verbs_mem_pool_get(&fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data data = { .conn = conn, .ep_rdm = ep, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_WRITE }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); if (len >= ep_rdm->rndv_threshold) { return -FI_EMSGSIZE; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep_rdm); if (raw_sbuf) { memcpy(raw_sbuf, buf, len); struct ibv_sge sge = { 0 }; struct ibv_send_wr wr = { 0 }; struct ibv_send_wr *bad_wr = NULL; wr.wr_id = FI_IBV_RDM_PACK_SERVICE_WR(conn); wr.sg_list = &sge; wr.num_sge = 1; wr.wr.rdma.remote_addr = addr; wr.wr.rdma.rkey = (uint32_t)key; wr.send_flags = (len < ep_rdm->max_inline_rc) ? IBV_SEND_INLINE : 0; wr.opcode = IBV_WR_RDMA_WRITE; sge.addr = (uint64_t)raw_sbuf; sge.length = len; sge.lkey = conn->s_mr->lkey; FI_IBV_RDM_TAGGED_INC_SEND_COUNTERS(conn, ep_rdm, wr.send_flags); int ret = ibv_post_send(conn->qp, &wr, &bad_wr); return (ret == 0) ? -FI_SUCCESS : -errno; } } return -FI_EAGAIN; } static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_ibv_rdm_ep_rma_read, .readv = fi_no_rma_readv, .readmsg = fi_no_rma_readmsg, .write = fi_ibv_rdm_ep_rma_write, .writev = fi_no_rma_writev, .writemsg = fi_no_rma_writemsg, .inject = fi_ibv_rdm_ep_rma_inject_write, .writedata = fi_no_rma_writedata, .injectdata = fi_no_rma_injectdata, }; struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep) { return &fi_ibv_rdm_ep_rma_ops; }
static ssize_t fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) src_addr; void *raw_buf = NULL; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep); if (raw_buf) { desc = (void*)(uintptr_t)conn->rma_mr->lkey; again = 0; } } if (again) { goto out_again; } } else if (!fi_ibv_rdm_check_connection(conn, ep) || RMA_RESOURCES_IS_BUSY(conn, ep)) { /* * TODO: Should be postponed queue flow for RMA be implemented? */ goto out_again; } struct fi_ibv_rdm_tagged_request *request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->rmabuf = raw_buf; struct fi_ibv_rdm_rma_start_data start_data = { .ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid), .conn = (struct fi_ibv_rdm_tagged_conn *) src_addr, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_READ }; fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY, &post_ready_data); out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { if(msg->iov_count == 1 && msg->rma_iov_count == 1) { return fi_ibv_rdm_ep_rma_read(ep, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, msg->desc[0], msg->addr, msg->rma_iov[0].addr, msg->rma_iov[0].key, msg->context); } assert(0); return -FI_EMSGSIZE; } static ssize_t fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = src_addr, .len = 0, .key = key }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, 0); } static ssize_t fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; void *raw_buf = NULL; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep); if (raw_buf) { memcpy (raw_buf, buf, len); desc = (void*)(uintptr_t)conn->rma_mr->lkey; again = 0; } } if (again) { goto out_again; } } else if (!fi_ibv_rdm_check_connection(conn, ep) || SEND_RESOURCES_IS_BUSY(conn, ep)) { /* * TODO: Should be postponed queue flow for RMA be implemented? */ goto out_again; } struct fi_ibv_rdm_tagged_request *request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->rmabuf = raw_buf; struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_WRITE }; fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY, &post_ready_data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { if(msg->iov_count == 1 && msg->rma_iov_count == 1) { return fi_ibv_rdm_ep_rma_write(ep, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, msg->desc[0], msg->addr, msg->rma_iov[0].addr, msg->rma_iov[0].key, msg->context); } assert(0); return -FI_EMSGSIZE; } static ssize_t fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = dest_addr, .len = 0, .key = key }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; return fi_ibv_rdm_ep_rma_writemsg(ep, &msg, 0); } static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; struct fi_ibv_rdm_tagged_request *request = NULL; int ret = FI_EP_RDM_HNDL_AGAIN; if (len >= ep_rdm->rndv_threshold) { return -FI_EMSGSIZE; } if (fi_ibv_rdm_check_connection(conn, ep_rdm) && !RMA_RESOURCES_IS_BUSY(conn, ep_rdm) && !conn->postponed_entry) { request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep_rdm, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = 0 }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); } switch (ret) { case FI_EP_RDM_HNDL_SUCCESS: return ret; case FI_EP_RDM_HNDL_AGAIN: ret = -FI_EAGAIN; break; default: ret = -errno; break; } if (request) { FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG); util_buf_release(fi_ibv_rdm_tagged_request_pool, request); } fi_ibv_rdm_tagged_poll(ep_rdm); return ret; } static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_ibv_rdm_ep_rma_read, .readv = fi_ibv_rdm_ep_rma_readv, .readmsg = fi_ibv_rdm_ep_rma_readmsg, .write = fi_ibv_rdm_ep_rma_write, .writev = fi_ibv_rdm_ep_rma_writev, .writemsg = fi_ibv_rdm_ep_rma_writemsg, .inject = fi_ibv_rdm_ep_rma_inject_write, .writedata = fi_no_rma_writedata, .injectdata = fi_no_rma_injectdata, }; struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep) { return &fi_ibv_rdm_ep_rma_ops; }
.peek_data = { .minfo = { .conn = conn, .tag = tag, .tagmask = ~ignore }, .context = context, .flags = 0 }, .dest_addr = buf, .data_len = len, .ep = ep }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RECV_START, &recv_data); VERBS_DBG(FI_LOG_EP_DATA, "fi_recvfrom: conn %p, tag 0x%llx, len %d, rbuf %p, fi_ctx %p, posted_recv %d\n", conn, tag, (int)len, buf, context, ep->posted_recvs); if (ret || request->state.eager == FI_IBV_STATE_EAGER_RECV_WAIT4PKT) { goto out; } } struct fi_ibv_recv_got_pkt_process_data data = { .ep = ep