static ssize_t fi_ibv_rdm_tagged_recvfrom(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { int ret = 0; struct fi_ibv_rdm_tagged_request *request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); fi_ibv_rdm_tagged_zero_request(request); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); struct fi_ibv_rdm_tagged_conn *conn = (src_addr == FI_ADDR_UNSPEC) ? NULL : (struct fi_ibv_rdm_tagged_conn *) src_addr; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); { struct fi_ibv_rdm_tagged_recv_start_data recv_data = { .peek_data = { .minfo = { .conn = conn, .tag = tag, .tagmask = ~ignore }, .context = context, .flags = 0 }, .dest_addr = buf, .data_len = len, .ep = ep };
int fi_ibv_rdm_tagged_prepare_send_request( struct fi_ibv_rdm_tagged_request *request, struct fi_ibv_rdm_ep *ep) { #if ENABLE_DEBUG int res = FI_IBV_RDM_TAGGED_SENDS_OUTGOING_ARE_LIMITED(request->minfo.conn, ep); if (res) { FI_IBV_RDM_TAGGED_DBG_REQUEST ("failed because SENDS_OUTGOING_ARE_LIMITED", request, FI_LOG_DEBUG); return !res; } res = PEND_SEND_IS_LIMITED(ep); if (res) { FI_IBV_RDM_TAGGED_DBG_REQUEST ("failed because PEND_SEND_IS_LIMITED", request, FI_LOG_DEBUG); return !res; } #endif // ENABLE_DEBUG request->sbuf = fi_ibv_rdm_prepare_send_resources(request->minfo.conn, ep); return !!request->sbuf; }
/*
 * Cancel a posted (not yet matched) receive identified by @ctx.
 *
 * Returns 0 when the request was found on the posted-receive queue and
 * released back to the pool, a positive 1 when no matching posted
 * request exists, -EBADF for an endpoint without a domain and -EINVAL
 * for a NULL context.
 *
 * NOTE(review): return codes mix raw errno (-EBADF/-EINVAL) with a bare
 * 1 for "not found" — looks inherited from an older revision; confirm
 * callers before normalizing to -FI_* codes.
 */
static ssize_t fi_ibv_rdm_tagged_ep_cancel(fid_t fid, void *ctx)
{
	struct fi_context *context = (struct fi_context *)ctx;
	struct fi_ibv_rdm_ep *fid_ep =
	    container_of(fid, struct fi_ibv_rdm_ep, ep_fid);

	if (!fid_ep->domain)
		return -EBADF;
	if (!context)
		return -EINVAL;
	if (context->internal[0] == NULL)
		return 0;	/* nothing attached — nothing to cancel */

	struct fi_ibv_rdm_tagged_request *request = context->internal[0];

	VERBS_DBG(FI_LOG_EP_DATA,
		  "ep_cancel, match %p, tag 0x%llx, len %d, ctx %p\n",
		  request, (long long unsigned)request->tag,
		  request->len, request->context);

	struct dlist_entry *found =
	    dlist_find_first_match(&fi_ibv_rdm_tagged_recv_posted_queue,
				   fi_ibv_rdm_tagged_match_requests, request);
	if (!found)
		return 1;	/* already matched or never posted */

	assert(container_of(found, struct fi_ibv_rdm_tagged_request,
			    queue_entry) == request);
	fi_ibv_rdm_tagged_remove_from_posted_queue(request, fid_ep);
	/* A posted-but-unmatched receive must have no sends in flight. */
	assert(request->send_completions_wait == 0);
	FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG);
	fi_ibv_mem_pool_return(&request->mpe,
			       &fi_ibv_rdm_tagged_request_pool);
	VERBS_DBG(FI_LOG_EP_DATA, "\t\t-> SUCCESS, pend recv %d\n",
		  fid_ep->pend_recv);
	return 0;
}
static ssize_t fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) src_addr; void *raw_buf = NULL; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep); if (raw_buf) { desc = (void*)(uintptr_t)conn->rma_mr->lkey; again = 0; } } if (again) { goto out_again; } } else if (!fi_ibv_rdm_check_connection(conn, ep) || RMA_RESOURCES_IS_BUSY(conn, ep)) { /* * TODO: Should be postponed queue flow for RMA be implemented? */ goto out_again; } struct fi_ibv_rdm_tagged_request *request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->rmabuf = raw_buf; struct fi_ibv_rdm_rma_start_data start_data = { .ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid), .conn = (struct fi_ibv_rdm_tagged_conn *) src_addr, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_READ }; fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY, &post_ready_data); out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { if(msg->iov_count == 1 && msg->rma_iov_count 
== 1) { return fi_ibv_rdm_ep_rma_read(ep, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, msg->desc[0], msg->addr, msg->rma_iov[0].addr, msg->rma_iov[0].key, msg->context); } assert(0); return -FI_EMSGSIZE; } static ssize_t fi_ibv_rdm_ep_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = src_addr, .len = 0, .key = key }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; return fi_ibv_rdm_ep_rma_readmsg(ep, &msg, 0); } static ssize_t fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; void *raw_buf = NULL; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { raw_buf = fi_ibv_rdm_rma_prepare_resources(conn, ep); if (raw_buf) { memcpy (raw_buf, buf, len); desc = (void*)(uintptr_t)conn->rma_mr->lkey; again = 0; } } if (again) { goto out_again; } } else if (!fi_ibv_rdm_check_connection(conn, ep) || SEND_RESOURCES_IS_BUSY(conn, ep)) { /* * TODO: Should be postponed queue flow for RMA be implemented? 
*/ goto out_again; } struct fi_ibv_rdm_tagged_request *request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; request->rmabuf = raw_buf; struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_WRITE }; fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); struct fi_ibv_rma_post_ready_data post_ready_data = { .ep_rdm = ep }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_SEND_READY, &post_ready_data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) { if(msg->iov_count == 1 && msg->rma_iov_count == 1) { return fi_ibv_rdm_ep_rma_write(ep, msg->msg_iov[0].iov_base, msg->msg_iov[0].iov_len, msg->desc[0], msg->addr, msg->rma_iov[0].addr, msg->rma_iov[0].key, msg->context); } assert(0); return -FI_EMSGSIZE; } static ssize_t fi_ibv_rdm_ep_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct fi_rma_iov rma_iov = { .addr = dest_addr, .len = 0, .key = key }; size_t i; for (i = 0; i < count; i++) { rma_iov.len += iov[i].iov_len; } struct fi_msg_rma msg = { .msg_iov = iov, .desc = desc, .iov_count = count, .addr = addr, .rma_iov = &rma_iov, .rma_iov_count = 1, .context = context, .data = 0 }; return fi_ibv_rdm_ep_rma_writemsg(ep, &msg, 0); } static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t 
len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; struct fi_ibv_rdm_tagged_request *request = NULL; int ret = FI_EP_RDM_HNDL_AGAIN; if (len >= ep_rdm->rndv_threshold) { return -FI_EMSGSIZE; } if (fi_ibv_rdm_check_connection(conn, ep_rdm) && !RMA_RESOURCES_IS_BUSY(conn, ep_rdm) && !conn->postponed_entry) { request = util_buf_alloc(fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_RMA_INJECT; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data start_data = { .conn = conn, .ep_rdm = ep_rdm, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = 0 }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &start_data); } switch (ret) { case FI_EP_RDM_HNDL_SUCCESS: return ret; case FI_EP_RDM_HNDL_AGAIN: ret = -FI_EAGAIN; break; default: ret = -errno; break; } if (request) { FI_IBV_RDM_TAGGED_DBG_REQUEST("to_pool: ", request, FI_LOG_DEBUG); util_buf_release(fi_ibv_rdm_tagged_request_pool, request); } fi_ibv_rdm_tagged_poll(ep_rdm); return ret; } static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_ibv_rdm_ep_rma_read, .readv = fi_ibv_rdm_ep_rma_readv, .readmsg = fi_ibv_rdm_ep_rma_readmsg, .write = fi_ibv_rdm_ep_rma_write, .writev = fi_ibv_rdm_ep_rma_writev, .writemsg = fi_ibv_rdm_ep_rma_writemsg, .inject = fi_ibv_rdm_ep_rma_inject_write, .writedata = fi_no_rma_writedata, .injectdata = fi_no_rma_injectdata, }; struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep) { return &fi_ibv_rdm_ep_rma_ops; }
static ssize_t fi_ibv_rdm_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) src_addr; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep); if (raw_sbuf) { memcpy (raw_sbuf, buf, len); buf = raw_sbuf; desc = (void*)(uintptr_t)conn->s_mr->lkey; again = 0; } } if (again) { goto out_again; } } struct fi_ibv_rdm_tagged_request *request = (struct fi_ibv_rdm_tagged_request *) fi_verbs_mem_pool_get(&fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data data = { .ep_rdm = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid), .conn = (struct fi_ibv_rdm_tagged_conn *) src_addr, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_READ }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? 
FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { ssize_t ret = FI_SUCCESS; struct fi_ibv_rdm_ep *ep = container_of(ep_fid, struct fi_ibv_rdm_ep, ep_fid); if (desc == NULL && len >= ep->rndv_threshold) { goto out_errinput; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; if (desc == NULL) { int again = 1; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep); if (raw_sbuf) { memcpy (raw_sbuf, buf, len); buf = raw_sbuf; desc = (void*)(uintptr_t)conn->s_mr->lkey; again = 0; } } if (again) { goto out_again; } } struct fi_ibv_rdm_tagged_request *request = (struct fi_ibv_rdm_tagged_request *) fi_verbs_mem_pool_get(&fi_ibv_rdm_tagged_request_pool); FI_IBV_RDM_TAGGED_DBG_REQUEST("get_from_pool: ", request, FI_LOG_DEBUG); /* Initial state */ request->state.eager = FI_IBV_STATE_EAGER_BEGIN; request->state.rndv = FI_IBV_STATE_RNDV_NOT_USED; struct fi_ibv_rdm_rma_start_data data = { .conn = conn, .ep_rdm = ep, .context = context, .data_len = (uint32_t)len, .rbuf = addr, .lbuf = (uintptr_t)buf, .rkey = (uint32_t)key, .lkey = (uint32_t)(uintptr_t)desc, .op_code = IBV_WR_RDMA_WRITE }; ret = fi_ibv_rdm_tagged_req_hndl(request, FI_IBV_EVENT_RMA_START, &data); ret = (ret == FI_EP_RDM_HNDL_SUCCESS) ? 
FI_SUCCESS : -FI_EOTHER; out: return ret; out_again: fi_ibv_rdm_tagged_poll(ep); ret = -FI_EAGAIN; goto out; out_errinput: ret = -FI_EINVAL; goto out; } static ssize_t fi_ibv_rdm_ep_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct fi_ibv_rdm_ep *ep_rdm = container_of(ep, struct fi_ibv_rdm_ep, ep_fid); if (len >= ep_rdm->rndv_threshold) { return -FI_EMSGSIZE; } struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *) dest_addr; if (!conn->postponed_entry) { void *raw_sbuf = fi_ibv_rdm_tagged_prepare_send_resources(conn, ep_rdm); if (raw_sbuf) { memcpy(raw_sbuf, buf, len); struct ibv_sge sge = { 0 }; struct ibv_send_wr wr = { 0 }; struct ibv_send_wr *bad_wr = NULL; wr.wr_id = FI_IBV_RDM_PACK_SERVICE_WR(conn); wr.sg_list = &sge; wr.num_sge = 1; wr.wr.rdma.remote_addr = addr; wr.wr.rdma.rkey = (uint32_t)key; wr.send_flags = (len < ep_rdm->max_inline_rc) ? IBV_SEND_INLINE : 0; wr.opcode = IBV_WR_RDMA_WRITE; sge.addr = (uint64_t)raw_sbuf; sge.length = len; sge.lkey = conn->s_mr->lkey; FI_IBV_RDM_TAGGED_INC_SEND_COUNTERS(conn, ep_rdm, wr.send_flags); int ret = ibv_post_send(conn->qp, &wr, &bad_wr); return (ret == 0) ? -FI_SUCCESS : -errno; } } return -FI_EAGAIN; } static struct fi_ops_rma fi_ibv_rdm_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_ibv_rdm_ep_rma_read, .readv = fi_no_rma_readv, .readmsg = fi_no_rma_readmsg, .write = fi_ibv_rdm_ep_rma_write, .writev = fi_no_rma_writev, .writemsg = fi_no_rma_writemsg, .inject = fi_ibv_rdm_ep_rma_inject_write, .writedata = fi_no_rma_writedata, .injectdata = fi_no_rma_injectdata, }; struct fi_ops_rma *fi_ibv_rdm_ep_ops_rma(struct fi_ibv_rdm_ep *ep) { return &fi_ibv_rdm_ep_rma_ops; }