static unsigned uct_tcp_ep_send(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); size_t send_length; ucs_status_t status; send_length = ep->length - ep->offset; ucs_assert(send_length > 0); status = uct_tcp_send(ep->fd, ep->buf + ep->offset, &send_length); if (status < 0) { return 0; } ucs_trace_data("tcp_ep %p: sent %zu bytes", ep, send_length); iface->outstanding -= send_length; ep->offset += send_length; if (ep->offset == ep->length) { ep->offset = 0; ep->length = 0; } return send_length > 0; }
/* send a signal to remote interface using Unix-domain socket */ static ucs_status_t uct_mm_ep_signal_remote(uct_mm_ep_t *ep, uct_mm_iface_conn_signal_t sig) { uct_mm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_mm_iface_t); int ret; /** * Send connect message to remote interface */ for (;;) { ret = sendto(iface->signal_fd, &sig, sizeof(sig), 0, (const struct sockaddr*)&ep->cached_signal_sockaddr, ep->cached_signal_addrlen); if (ret >= 0) { ucs_assert(ret == sizeof(sig)); return UCS_OK; } else if (errno == EAGAIN) { /* If sending a signal has failed, retry. * Note the by default the receiver might have a limited backlog, * on Linux systems it is net.unix.max_dgram_qlen (10 by default). */ uct_mm_iface_recv_messages(iface); } else { if ((sig == UCT_MM_IFACE_SIGNAL_DISCONNECT) && (errno == ECONNREFUSED)) { ucs_debug("failed to send disconnect signal: connection refused"); } else { ucs_error("failed to send %sconnect signal: %m", (sig == UCT_MM_IFACE_SIGNAL_CONNECT) ? "" : "dis"); } return UCS_ERR_IO_ERROR; } } }
/** * Register the memory on all PDs, except maybe for alloc_pd. * In case alloc_pd != NULL, alloc_pd_memh will hold the memory key obtained from * allocation. It will be put in the array of keys in the proper index. */ static ucs_status_t ucp_memh_reg_pds(ucp_context_h context, ucp_mem_h memh, uct_mem_h alloc_pd_memh) { uct_mem_h dummy_pd_memh; unsigned uct_memh_count; ucs_status_t status; unsigned pd_index; memh->pd_map = 0; uct_memh_count = 0; /* Register on all transports (except the one we used to allocate) */ for (pd_index = 0; pd_index < context->num_pds; ++pd_index) { if (context->pds[pd_index] == memh->alloc_pd) { /* Add the memory handle we got from allocation */ ucs_assert(memh->alloc_method == UCT_ALLOC_METHOD_PD); memh->pd_map |= UCS_BIT(pd_index); memh->uct[uct_memh_count++] = alloc_pd_memh; } else if (context->pd_attrs[pd_index].cap.flags & UCT_PD_FLAG_REG) { /* If the PD supports registration, register on it as well */ status = uct_pd_mem_reg(context->pds[pd_index], memh->address, memh->length, &memh->uct[uct_memh_count]); if (status != UCS_OK) { ucp_memh_dereg_pds(context, memh, &dummy_pd_memh); return status; } memh->pd_map |= UCS_BIT(pd_index); ++uct_memh_count; } } return UCS_OK; }
ucs_status_t uct_rc_verbs_ep_get_bcopy(uct_ep_h tl_ep, uct_unpack_callback_t unpack_cb, void *arg, size_t length, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); uct_rc_iface_send_desc_t *desc; struct ibv_send_wr wr; struct ibv_sge sge; UCT_CHECK_LENGTH(length, iface->super.super.config.seg_size, "get_bcopy"); UCT_RC_VERBS_CHECK_RES(iface, ep); UCT_RC_IFACE_GET_TX_DESC(&iface->super, iface->super.tx.mp, desc); ucs_assert(length <= iface->super.super.config.seg_size); desc->super.handler = (comp == NULL) ? uct_rc_ep_get_bcopy_handler_no_completion : uct_rc_ep_get_bcopy_handler; desc->super.unpack_arg = arg; desc->super.user_comp = comp; desc->super.length = length; desc->unpack_cb = unpack_cb; uct_rc_verbs_fill_rdma_wr(&wr, IBV_WR_RDMA_READ, &sge, length, remote_addr, rkey); UCT_TL_EP_STAT_OP(&ep->super.super, GET, BCOPY, length); uct_rc_verbs_ep_post_send_desc(ep, &wr, desc, IBV_SEND_SIGNALED); return UCS_INPROGRESS; }
static ucs_status_t uct_knem_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, void *handle) { ucs_assert(NULL == handle); ucs_free((void *)rkey); return UCS_OK; }
static size_t ucp_tag_pack_eager_first_generic(void *dest, void *arg) { ucp_eager_first_hdr_t *hdr = dest; ucp_request_t *req = arg; size_t max_length, length; ucs_assert(req->send.state.offset == 0); max_length = ucp_ep_config(req->send.ep)->max_am_bcopy - sizeof(*hdr); hdr->super.super.tag = req->send.tag; hdr->total_len = req->send.length; ucs_assert(req->send.length > max_length); length = ucp_request_generic_dt_pack(req, hdr + 1, max_length); return sizeof(*hdr) + length; }
ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_rc_ep_t *rc_ep) { uct_rc_verbs_iface_t *iface = ucs_derived_of(rc_ep->super.super.iface, uct_rc_verbs_iface_t); uct_rc_verbs_ep_t *ep = ucs_derived_of(rc_ep, uct_rc_verbs_ep_t); uct_rc_hdr_t hdr; struct ibv_send_wr fc_wr; /* Do not check FC WND here to avoid head-to-head deadlock. * Credits grant should be sent regardless of FC wnd state. */ ucs_assert(sizeof(hdr) <= iface->verbs_common.config.max_inline); UCT_RC_CHECK_RES(&iface->super, &ep->super); hdr.am_id = UCT_RC_EP_FC_PURE_GRANT; fc_wr.sg_list = iface->verbs_common.inl_sge; fc_wr.num_sge = 1; fc_wr.opcode = IBV_WR_SEND; fc_wr.next = NULL; iface->verbs_common.inl_sge[0].addr = (uintptr_t)&hdr; iface->verbs_common.inl_sge[0].length = sizeof(hdr); uct_rc_verbs_ep_post_send(iface, ep, &fc_wr, IBV_SEND_INLINE); return UCS_OK; }
static size_t ucp_tag_rndv_pack_rkey(ucp_request_t *sreq, ucp_rndv_rts_hdr_t *rndv_rts_hdr) { ucp_ep_h ep = sreq->send.ep; size_t packed_rkey = 0; ucs_status_t status; ucs_assert(UCP_DT_IS_CONTIG(sreq->send.datatype)); /* Check if the sender needs to register the send buffer - * is its datatype contiguous and does the receive side need it */ if ((ucp_ep_is_rndv_lane_present(ep)) && (ucp_ep_rndv_md_flags(ep) & UCT_MD_FLAG_NEED_RKEY)) { status = ucp_request_send_buffer_reg(sreq, ucp_ep_get_rndv_get_lane(ep)); ucs_assert_always(status == UCS_OK); /* if the send buffer was registered, send the rkey */ UCS_PROFILE_CALL(uct_md_mkey_pack, ucp_ep_md(ep, ucp_ep_get_rndv_get_lane(ep)), sreq->send.state.dt.contig.memh, rndv_rts_hdr + 1); rndv_rts_hdr->flags |= UCP_RNDV_RTS_FLAG_PACKED_RKEY; packed_rkey = ucp_ep_md_attr(ep, ucp_ep_get_rndv_get_lane(ep))->rkey_packed_size; } return packed_rkey; }
static size_t ucp_tag_pack_eager_sync_only_dt(void *dest, void *arg) { ucp_eager_sync_hdr_t *hdr = dest; ucp_request_t *req = arg; size_t length; hdr->super.super.tag = req->send.tag.tag; hdr->req.sender_uuid = req->send.ep->worker->uuid; hdr->req.reqptr = (uintptr_t)req; ucs_assert(req->send.state.dt.offset == 0); length = ucp_dt_pack(req->send.datatype, hdr + 1, req->send.buffer, &req->send.state.dt, req->send.length); ucs_assert(length == req->send.length); return sizeof(*hdr) + length; }
static UCS_F_ALWAYS_INLINE ucs_status_t uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface) { struct mlx5_cqe64 *cqe; uint16_t ci; uct_ib_iface_recv_desc_t *desc; uint32_t len; void *packet; ucs_status_t status; ci = iface->rx.wq.cq_wqe_counter & iface->rx.wq.mask; packet = (void *)ntohll(iface->rx.wq.wqes[ci].addr); ucs_prefetch(packet + UCT_IB_GRH_LEN); desc = (uct_ib_iface_recv_desc_t *)(packet - iface->super.super.config.rx_hdr_offset); cqe = uct_ib_mlx5_get_cqe(&iface->rx.cq, UCT_IB_MLX5_CQE64_SIZE_LOG); if (cqe == NULL) { status = UCS_ERR_NO_PROGRESS; goto out; } uct_ib_mlx5_log_cqe(cqe); ucs_assert(0 == (cqe->op_own & (MLX5_INLINE_SCATTER_32|MLX5_INLINE_SCATTER_64))); ucs_assert(ntohs(cqe->wqe_counter) == iface->rx.wq.cq_wqe_counter); iface->super.rx.available++; iface->rx.wq.cq_wqe_counter++; len = ntohl(cqe->byte_cnt); VALGRIND_MAKE_MEM_DEFINED(packet, len); uct_ud_ep_process_rx(&iface->super, (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN), len - UCT_IB_GRH_LEN, (uct_ud_recv_skb_t *)desc); status = UCS_OK; out: if (iface->super.rx.available >= iface->super.config.rx_max_batch) { /* we need to try to post buffers always. Otherwise it is possible * to run out of rx wqes if receiver is slow and there are always * cqe to process */ uct_ud_mlx5_iface_post_recv(iface); } return status; }
static ucs_status_t get_nic_address(uct_ugni_device_t *dev_p) { int alps_addr = -1; int alps_dev_id = -1; int i; char *token, *pmi_env; pmi_env = getenv("PMI_GNI_DEV_ID"); if (NULL == pmi_env) { gni_return_t ugni_rc; ugni_rc = GNI_CdmGetNicAddress(dev_p->device_id, &dev_p->address, &dev_p->cpu_id); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_CdmGetNicAddress failed, device %d, Error status: %s %d", dev_p->device_id, gni_err_str[ugni_rc], ugni_rc); return UCS_ERR_NO_DEVICE; } CPU_SET(dev_p->cpu_id, &(dev_p->cpu_mask)); ucs_debug("(GNI) NIC address: %d", dev_p->address); } else { while ((token = strtok(pmi_env, ":")) != NULL) { alps_dev_id = atoi(token); if (alps_dev_id == dev_p->device_id) { break; } pmi_env = NULL; } ucs_assert(alps_dev_id != -1); pmi_env = getenv("PMI_GNI_LOC_ADDR"); ucs_assert(NULL != pmi_env); i = 0; while ((token = strtok(pmi_env, ":")) != NULL) { if (i == alps_dev_id) { alps_addr = atoi(token); break; } pmi_env = NULL; ++i; } ucs_assert(alps_addr != -1); dev_p->address = alps_addr; ucs_debug("(PMI) NIC address: %d", dev_p->address); } return UCS_OK; }
static ucs_status_t uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *buffer, unsigned length) { uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_mlx5_iface_t); struct mlx5_wqe_ctrl_seg *ctrl; struct mlx5_wqe_inl_data_seg *inl; uct_ud_am_short_hdr_t *am; uct_ud_neth_t *neth; unsigned wqe_size; uct_ud_send_skb_t *skb; /* data a written directly into tx wqe, so it is impossible to use * common ud am code */ UCT_CHECK_AM_ID(id); UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + sizeof(hdr) + length, iface->super.config.max_inline, "am_short"); uct_ud_enter(&iface->super); uct_ud_iface_progress_pending_tx(&iface->super); skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super); if (!skb) { uct_ud_leave(&iface->super); return UCS_ERR_NO_RESOURCE; } ctrl = iface->tx.wq.curr; /* Set inline segment which has AM id, AM header, and AM payload */ inl = uct_ib_mlx5_get_next_seg(&iface->tx.wq, ctrl, UCT_UD_MLX5_WQE_SIZE); wqe_size = length + sizeof(*am) + sizeof(*neth); inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG); /* assume that neth and am header fit into one bb */ ucs_assert(sizeof(*am) + sizeof(*neth) < MLX5_SEND_WQE_BB); neth = (void*)(inl + 1); uct_ud_am_set_neth(neth, &ep->super, id); am = (void*)(neth + 1); am->hdr = hdr; uct_ib_mlx5_inline_copy(am + 1, buffer, length, &iface->tx.wq); wqe_size += UCT_UD_MLX5_WQE_SIZE + sizeof(*inl); UCT_CHECK_LENGTH(wqe_size, UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_short"); UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); uct_ud_mlx5_post_send(iface, ep, ctrl, wqe_size); skb->len = sizeof(*neth) + sizeof(*am); memcpy(skb->neth, neth, skb->len); uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb, (char *)skb->neth + skb->len, buffer, length); UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, sizeof(hdr) + length); uct_ud_leave(&iface->super); return UCS_OK; }
/* Pack a string and return a pointer to storage right after the string */ static void* ucp_address_pack_string(const char *s, void *dest) { size_t length = strlen(s); ucs_assert(length <= UINT8_MAX); *(uint8_t*)dest = length; memcpy(dest + 1, s, length); return dest + 1 + length; }
/* send a signal to remote interface using Unix-domain socket */ static ucs_status_t uct_mm_ep_signal_remote(uct_mm_ep_t *ep, uct_mm_iface_conn_signal_t sig) { uct_mm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_mm_iface_t); int ret; /** * Send connect message to remote interface */ ret = sendto(iface->signal_fd, &sig, sizeof(sig), 0, (const struct sockaddr*)&ep->cached_signal_sockaddr, ep->cached_signal_addrlen); if (ret >= 0) { ucs_assert(ret == sizeof(sig)); ucs_debug("Sent connect from socket %d to %p", iface->signal_fd, (const struct sockaddr*)&ep->cached_signal_sockaddr); if (ep->cbq_elem_on) { uct_mm_ep_remove_slow_path_callback(iface, ep); } /* point the ep->fifo_ctl to the remote fifo */ uct_mm_ep_connected(ep); return UCS_OK; } else if (errno == EAGAIN) { /* If sending a signal has failed, retry. * Note that by default the receiver might have a limited backlog, * on Linux systems it is net.unix.max_dgram_qlen (10 by default). */ ucs_debug("Failed to send connect from socket %d to %p", iface->signal_fd, (const struct sockaddr*)&ep->cached_signal_sockaddr); /* If sending the Connect message failed with EAGAIN, try again later. * Don't keep trying now in a loop since this may cause a deadlock which * prevents the reading of incoming messages which blocks the remote sender. * Add the sending attempt as a callback to a slow progress. */ if ((!ep->cbq_elem_on) && (sig == UCT_MM_IFACE_SIGNAL_CONNECT)) { ep->cbq_elem.cb = uct_mm_ep_signal_remote_slow_path_callback; uct_worker_slowpath_progress_register(iface->super.worker, &ep->cbq_elem); ep->cbq_elem_on = 1; } /* Return UCS_OK in this case even though couldn't send, so that the * calling flow would release the lock and allow the reading of incoming * Connect messages. */ return UCS_OK; } else { if (errno == ECONNREFUSED) { ucs_debug("failed to send connect signal: connection refused"); } else { ucs_error("failed to send connect signal: %m"); } return UCS_ERR_IO_ERROR; } }
static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) { ucp_eager_first_hdr_t *hdr = dest; ucp_request_t *req = arg; size_t length; length = ucp_ep_config(req->send.ep)->am.max_bcopy - sizeof(*hdr); hdr->super.super.tag = req->send.tag; hdr->total_len = req->send.length; ucs_debug("pack eager_first paylen %zu", length); ucs_assert(req->send.state.offset == 0); ucs_assert(req->send.length > length); return sizeof(*hdr) + ucp_dt_pack(req->send.datatype, hdr + 1, req->send.buffer, &req->send.state, length); }
static size_t ucp_tag_pack_eager_sync_first_contig(void *dest, void *arg) { ucp_eager_sync_first_hdr_t *hdr = dest; ucp_request_t *req = arg; size_t length; length = ucp_ep_config(req->send.ep)->max_am_bcopy - sizeof(*hdr); hdr->super.super.super.tag = req->send.tag; hdr->super.total_len = req->send.length; hdr->req.sender_uuid = req->send.ep->worker->uuid; hdr->req.reqptr = (uintptr_t)req; ucs_debug("pack eager_sync_first paylen %zu", length); ucs_assert(req->send.state.offset == 0); ucs_assert(req->send.length > length); memcpy(hdr + 1, req->send.buffer, length); return sizeof(*hdr) + length; }
unsigned uct_tcp_ep_progress_rx(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); uct_tcp_am_hdr_t *hdr; ucs_status_t status; size_t recv_length; ssize_t remainder; ucs_trace_func("ep=%p", ep); /* Receive next chunk of data */ recv_length = iface->config.buf_size - ep->length; status = uct_tcp_recv(ep->fd, ep->buf + ep->length, &recv_length); if (status != UCS_OK) { if (status == UCS_ERR_CANCELED) { ucs_debug("tcp_ep %p: remote disconnected", ep); uct_tcp_ep_mod_events(ep, 0, EPOLLIN); uct_tcp_ep_destroy(&ep->super.super); } return 0; } ep->length += recv_length; ucs_trace_data("tcp_ep %p: recvd %zu bytes", ep, recv_length); /* Parse received active messages */ while ((remainder = ep->length - ep->offset) >= sizeof(*hdr)) { hdr = ep->buf + ep->offset; if (remainder < sizeof(*hdr) + hdr->length) { break; } /* Full message was received */ ep->offset += sizeof(*hdr) + hdr->length; if (hdr->am_id >= UCT_AM_ID_MAX) { ucs_error("invalid am id: %d", hdr->am_id); continue; } uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_RECV, hdr->am_id, hdr + 1, hdr->length, "RECV fd %d", ep->fd); uct_iface_invoke_am(&iface->super, hdr->am_id, hdr + 1, hdr->length, 0); } /* Move the remaining data to the beginning of the buffer * TODO avoid extra copy on partial receive */ ucs_assert(remainder >= 0); memmove(ep->buf, ep->buf + ep->offset, remainder); ep->offset = 0; ep->length = remainder; return recv_length > 0; }
ucs_status_t ucp_request_check_status(void *request) { ucp_request_t *req = (ucp_request_t*)request - 1; if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { ucs_assert(req->status != UCS_INPROGRESS); return req->status; } return UCS_INPROGRESS; }
/* Unpack a string and return pointer to next storage byte */ static const void* ucp_address_unpack_string(const void *src, char *s, size_t max) { size_t length, avail; ucs_assert(max >= 1); length = *(const uint8_t*)src; avail = ucs_min(length, max - 1); memcpy(s, src + 1, avail); s[avail] = '\0'; return src + length + 1; }
static ucs_status_t uct_ud_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_mlx5_iface_t); struct mlx5_wqe_ctrl_seg *ctrl; struct mlx5_wqe_inl_data_seg *inl; unsigned wqe_size; uct_ud_put_hdr_t *put_hdr; uct_ud_neth_t *neth; uct_ud_send_skb_t *skb; uct_ud_enter(&iface->super); uct_ud_iface_progress_pending_tx(&iface->super); skb = uct_ud_ep_get_tx_skb(&iface->super, &ep->super); if (!skb) { uct_ud_leave(&iface->super); return UCS_ERR_NO_RESOURCE; } ctrl = iface->tx.wq.curr; /* Set inline segment which has AM id, AM header, and AM payload */ inl = uct_ib_mlx5_get_next_seg(&iface->tx.wq, ctrl, UCT_UD_MLX5_WQE_SIZE); wqe_size = length + sizeof(*put_hdr) + sizeof(*neth); inl->byte_count = htonl(wqe_size | MLX5_INLINE_SEG); /* assume that neth and am header fit into one bb */ ucs_assert(sizeof(*put_hdr) + sizeof(*neth) < MLX5_SEND_WQE_BB); neth = (void*)(inl + 1); uct_ud_neth_init_data(&ep->super, neth); uct_ud_neth_set_type_put(&ep->super, neth); uct_ud_neth_ack_req(&ep->super, neth); put_hdr = (uct_ud_put_hdr_t *)(neth+1); put_hdr->rva = remote_addr; uct_ib_mlx5_inline_copy(put_hdr + 1, buffer, length, &iface->tx.wq); wqe_size += UCT_UD_MLX5_WQE_SIZE + sizeof(*inl); UCT_CHECK_LENGTH(wqe_size, UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "put_short"); UCT_UD_EP_HOOK_CALL_TX(&ep->super, neth); uct_ud_mlx5_post_send(iface, ep, ctrl, wqe_size); skb->len = sizeof(*neth) + sizeof(*put_hdr); memcpy(skb->neth, neth, skb->len); uct_ud_iface_complete_tx_inl(&iface->super, &ep->super, skb, (char *)skb->neth + skb->len, buffer, length); UCT_TL_EP_STAT_OP(&ep->super.super, PUT, SHORT, length); uct_ud_leave(&iface->super); return UCS_OK; }
static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_tag_send_req(ucp_request_t *req, size_t count, const ucp_ep_msg_config_t* msg_config, size_t rndv_rma_thresh, size_t rndv_am_thresh, ucp_send_callback_t cb, const ucp_proto_t *proto) { size_t seg_size = (msg_config->max_bcopy - proto->only_hdr_size); size_t rndv_thresh = ucp_tag_get_rndv_threshold(req, count, msg_config->max_iov, rndv_rma_thresh, rndv_am_thresh, seg_size); size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count, rndv_thresh); ssize_t max_short = ucp_proto_get_short_max(req, msg_config); ucs_status_t status; ucs_trace_req("select tag request(%p) progress algorithm datatype=%lx " "buffer=%p length=%zu max_short=%zd rndv_thresh=%zu " "zcopy_thresh=%zu", req, req->send.datatype, req->send.buffer, req->send.length, max_short, rndv_thresh, zcopy_thresh); status = ucp_request_send_start(req, max_short, zcopy_thresh, seg_size, rndv_thresh, proto); if (ucs_unlikely(status != UCS_OK)) { if (status == UCS_ERR_NO_PROGRESS) { ucs_assert(req->send.length >= rndv_thresh); /* RMA/AM rendezvous */ status = ucp_tag_send_start_rndv(req); } if (status != UCS_OK) { return UCS_STATUS_PTR(status); } } ucp_request_send_tag_stat(req); /* * Start the request. * If it is completed immediately, release the request and return the status. * Otherwise, return the request. */ status = ucp_request_send(req); if (req->flags & UCP_REQUEST_FLAG_COMPLETED) { ucs_trace_req("releasing send request %p, returning status %s", req, ucs_status_string(status)); ucp_request_put(req); return UCS_STATUS_PTR(status); } ucp_request_set_callback(req, send.cb, cb) ucs_trace_req("returning send request %p", req); return req + 1; }
static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) { ucp_eager_first_hdr_t *hdr = dest; ucp_request_t *req = arg; size_t length; ucs_assert(req->send.lane == ucp_ep_get_am_lane(req->send.ep)); length = ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - sizeof(*hdr); hdr->super.super.tag = req->send.tag.tag; hdr->total_len = req->send.length; hdr->msg_id = req->send.tag.message_id; ucs_assert(req->send.state.dt.offset == 0); ucs_assert(req->send.length > length); return sizeof(*hdr) + ucp_dt_pack(req->send.datatype, hdr + 1, req->send.buffer, &req->send.state.dt, length); }
void ucp_ep_destroy(ucp_ep_h ep) { ucs_debug("destroy ep %p", ep); ucp_wireup_stop(ep); if (ep->state & UCP_EP_STATE_READY_TO_SEND) { ucp_ep_destroy_uct_ep_safe(ep, ep->uct_ep); } else { ucs_assert(ep->uct_ep == NULL); } ucs_free(ep); }
ucs_status_t ucp_stream_recv_request_test(void *request, size_t *length_p) { ucp_request_t *req = (ucp_request_t*)request - 1; ucs_status_t status = ucp_request_check_status(request); if (status != UCS_INPROGRESS) { ucs_assert(req->flags & UCP_REQUEST_FLAG_STREAM_RECV); *length_p = req->recv.stream.length; } return status; }
ucs_status_t ucp_tag_recv_request_test(void *request, ucp_tag_recv_info_t *info) { ucp_request_t *req = (ucp_request_t*)request - 1; ucs_status_t status = ucp_request_check_status(request); if (status != UCS_INPROGRESS) { ucs_assert(req->flags & UCP_REQUEST_FLAG_RECV); *info = req->recv.tag.info; } return status; }
static ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf, ucx_perf_params_t *params) { ucs_status_t status; /* TODO use params->alignment */ status = uct_iface_mem_alloc(perf->uct.iface, params->message_size * params->thread_count, 0, "perftest", &perf->uct.send_mem); if (status != UCS_OK) { ucs_error("Failed allocate send buffer: %s", ucs_status_string(status)); goto err; } ucs_assert(perf->uct.send_mem.md == perf->uct.md); perf->send_buffer = perf->uct.send_mem.address; status = uct_iface_mem_alloc(perf->uct.iface, params->message_size * params->thread_count, 0, "perftest", &perf->uct.recv_mem); if (status != UCS_OK) { ucs_error("Failed allocate receive buffer: %s", ucs_status_string(status)); goto err_free_send; } ucs_assert(perf->uct.recv_mem.md == perf->uct.md); perf->recv_buffer = perf->uct.recv_mem.address; perf->offset = 0; ucs_debug("allocated memory. Send buffer %p, Recv buffer %p", perf->send_buffer, perf->recv_buffer); return UCS_OK; err_free_send: uct_iface_mem_free(&perf->uct.send_mem); err: return status; }
static inline ucs_status_t uct_knem_rma(uct_ep_h tl_ep, const void *buffer, size_t length, uint64_t remote_addr, uct_knem_key_t *key, int write) { struct knem_cmd_inline_copy icopy; struct knem_cmd_param_iovec knem_iov[1]; uct_knem_iface_t *knem_iface = ucs_derived_of(tl_ep->iface, uct_knem_iface_t); int knem_fd = knem_iface->knem_pd->knem_fd; int rc; UCT_KNEM_ZERO_LENGTH_POST(length); knem_iov[0].base = (uintptr_t)buffer; knem_iov[0].len = length; icopy.local_iovec_array = (uintptr_t) &knem_iov[0]; icopy.local_iovec_nr = 1; icopy.remote_cookie = key->cookie; ucs_assert(remote_addr >= key->address); icopy.remote_offset = remote_addr - key->address; icopy.write = write; /* if 0 then, READ from the remote region into my local segments * if 1 then, WRITE to the remote region from my local segment */ icopy.flags = 0; /* TBD: add check and support for KNEM_FLAG_DMA */ icopy.current_status = 0; icopy.async_status_index = 0; icopy.pad = 0; ucs_assert(knem_fd > -1); rc = ioctl(knem_fd, KNEM_CMD_INLINE_COPY, &icopy); if (rc < 0) { ucs_error("KNEM inline copy failed, err = %d %m", rc); return UCS_ERR_IO_ERROR; } uct_knem_trace_data(remote_addr, (uintptr_t)key, "%s [length %zu]", write?"PUT_ZCOPY":"GET_ZCOPY", length); return UCS_OK; }
ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); if (!uct_tcp_ep_can_send(ep)) { return UCS_ERR_NO_RESOURCE; } ucs_assert(ucs_queue_is_empty(&ep->pending_q)); UCT_TL_EP_STAT_FLUSH(&ep->super); return UCS_OK; }
static int ucp_tag_rndv_is_get_op_possible(ucp_ep_h ep, uct_rkey_t rkey) { uint64_t md_flags; ucs_assert(!ucp_ep_is_stub(ep)); if (ucp_ep_is_rndv_lane_present(ep)) { md_flags = ucp_ep_rndv_md_flags(ep); return (((rkey != UCT_INVALID_RKEY) && (md_flags & UCT_MD_FLAG_REG)) || !(md_flags & UCT_MD_FLAG_NEED_RKEY)); } else { return 0; } }
static void ucp_ep_flush_completion(uct_completion_t *self, ucs_status_t status) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct_comp); ucs_assert(!(req->flags & UCP_REQUEST_FLAG_COMPLETED)); if (status == UCS_OK) { req->status = status; } ucp_ep_flush_progress(req); ucp_flush_check_completion(req); }