static void ucp_ep_flush_progress(ucp_request_t *req)
{
    ucp_ep_h ep = req->send.ep;
    ucp_lane_index_t lane;
    ucs_status_t status;
    uct_ep_h uct_ep;

    ucs_trace("ep %p: progress flush req %p, lanes 0x%x count %d", ep, req,
              req->send.flush.lanes, req->send.uct_comp.count);

    while (req->send.flush.lanes) {
        /* Search for next lane to start flush */
        lane   = ucs_ffs64(req->send.flush.lanes);
        uct_ep = ep->uct_eps[lane];
        if (uct_ep == NULL) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
            --req->send.uct_comp.count;
            continue;
        }

        /* Start flush operation on UCT endpoint */
        status = uct_ep_flush(uct_ep, 0, &req->send.uct_comp);
        ucs_trace("flushing ep %p lane[%d]: %s", ep, lane,
                  ucs_status_string(status));
        if (status == UCS_OK) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
            --req->send.uct_comp.count;
        } else if (status == UCS_INPROGRESS) {
            req->send.flush.lanes &= ~UCS_BIT(lane);
        } else if (status == UCS_ERR_NO_RESOURCE) {
            if (req->send.lane != UCP_NULL_LANE) {
                ucs_trace("ep %p: not adding pending flush %p on lane %d, "
                          "because it's already pending on lane %d",
                          ep, req, lane, req->send.lane);
                break;
            }

            req->send.lane = lane;
            status = uct_ep_pending_add(uct_ep, &req->send.uct);
            ucs_trace("adding pending flush on ep %p lane[%d]: %s", ep, lane,
                      ucs_status_string(status));
            if (status == UCS_OK) {
                req->send.flush.lanes &= ~UCS_BIT(lane);
            } else if (status != UCS_ERR_BUSY) {
                ucp_ep_flush_error(req, status);
            }
        } else {
            ucp_ep_flush_error(req, status);
        }
    }
}

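The loop above walks the lane bitmap with ucs_ffs64()/UCS_BIT(): find the
lowest set bit, handle that lane, clear the bit. A minimal stand-alone sketch
of the same idiom, using GCC/Clang builtins in place of the UCS helpers
(illustration only, not taken from the sources above):

#include <stdint.h>
#include <stdio.h>

/* iterate over the set bits of a lane mask, lowest bit first */
static void iterate_lanes(uint64_t lanes)
{
    while (lanes != 0) {
        /* __builtin_ctzll() gives the index of the lowest set bit,
         * analogous to ucs_ffs64() in the snippet above */
        unsigned lane = (unsigned)__builtin_ctzll(lanes);
        printf("processing lane %u\n", lane);
        lanes &= ~(1ull << lane);   /* clear it, like &= ~UCS_BIT(lane) */
    }
}
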
void mca_spml_ucx_rmkey_unpack(shmem_ctx_t ctx, sshmem_mkey_t *mkey,
                               uint32_t segno, int pe, int tr_id)
{
    spml_ucx_mkey_t *ucx_mkey;
    mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;
    ucs_status_t err;

    ucx_mkey = &ucx_ctx->ucp_peers[pe].mkeys[segno].key;

    err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn,
                             mkey->u.data, &ucx_mkey->rkey);
    if (UCS_OK != err) {
        SPML_UCX_ERROR("failed to unpack rkey: %s", ucs_status_string(err));
        goto error_fatal;
    }

    if (ucx_ctx == &mca_spml_ucx_ctx_default) {
        mkey->spml_context = ucx_mkey;
    }
    mca_spml_ucx_cache_mkey(ucx_ctx, mkey, segno, pe);
    return;

error_fatal:
    oshmem_shmem_abort(-1);
    return;
}

void mca_spml_ucx_memuse_hook(void *addr, size_t length)
{
    int my_pe;
    spml_ucx_mkey_t *ucx_mkey;
    ucp_mem_advise_params_t params;
    ucs_status_t status;

    if (!(mca_spml_ucx.heap_reg_nb &&
          memheap_is_va_in_segment(addr, HEAP_SEG_INDEX))) {
        return;
    }

    my_pe    = oshmem_my_proc_id();
    ucx_mkey = &mca_spml_ucx_ctx_default.ucp_peers[my_pe].mkeys[HEAP_SEG_INDEX].key;

    params.field_mask = UCP_MEM_ADVISE_PARAM_FIELD_ADDRESS |
                        UCP_MEM_ADVISE_PARAM_FIELD_LENGTH |
                        UCP_MEM_ADVISE_PARAM_FIELD_ADVICE;
    params.address    = addr;
    params.length     = length;
    params.advice     = UCP_MADV_WILLNEED;

    status = ucp_mem_advise(mca_spml_ucx.ucp_context, ucx_mkey->mem_h, &params);
    if (UCS_OK != status) {
        SPML_UCX_ERROR("ucp_mem_advise failed addr %p len %llu : %s", addr,
                       (unsigned long long)length, ucs_status_string(status));
    }
}

size_t ucp_tag_rndv_rts_pack(void *dest, void *arg)
{
    ucp_request_t *sreq              = arg;   /* send request */
    ucp_rndv_rts_hdr_t *rndv_rts_hdr = dest;
    ucp_worker_h worker              = sreq->send.ep->worker;
    ssize_t packed_rkey_size;

    rndv_rts_hdr->super.tag        = sreq->send.tag.tag;
    rndv_rts_hdr->sreq.reqptr      = (uintptr_t)sreq;
    rndv_rts_hdr->sreq.sender_uuid = worker->uuid;
    rndv_rts_hdr->size             = sreq->send.length;

    /* Pack remote keys (which can be empty list) */
    if (UCP_DT_IS_CONTIG(sreq->send.datatype) &&
        ucp_rndv_is_get_zcopy(sreq, worker->context->config.ext.rndv_mode)) {
        /* pack rkey, ask target to do get_zcopy */
        rndv_rts_hdr->address = (uintptr_t)sreq->send.buffer;
        packed_rkey_size = ucp_rkey_pack_uct(worker->context,
                                             sreq->send.state.dt.dt.contig.md_map,
                                             sreq->send.state.dt.dt.contig.memh,
                                             rndv_rts_hdr + 1);
        if (packed_rkey_size < 0) {
            /* a negative size encodes a ucs_status_t error code */
            ucs_fatal("failed to pack rendezvous remote key: %s",
                      ucs_status_string((ucs_status_t)packed_rkey_size));
        }
    } else {
        rndv_rts_hdr->address = 0;
        packed_rkey_size      = 0;
    }

    return sizeof(*rndv_rts_hdr) + packed_rkey_size;
}

static UCS_CLASS_CLEANUP_FUNC(uct_mm_ep_t)
{
    uct_mm_iface_t *iface = ucs_derived_of(self->super.super.iface,
                                           uct_mm_iface_t);
    ucs_status_t status;
    uct_mm_remote_seg_t *remote_seg;
    struct sglib_hashed_uct_mm_remote_seg_t_iterator iter;

    uct_mm_ep_signal_remote(self, UCT_MM_IFACE_SIGNAL_DISCONNECT);
    uct_worker_progress_unregister(iface->super.worker,
                                   uct_mm_iface_progress, iface);

    for (remote_seg = sglib_hashed_uct_mm_remote_seg_t_it_init(&iter,
                                              self->remote_segments_hash);
         remote_seg != NULL;
         remote_seg = sglib_hashed_uct_mm_remote_seg_t_it_next(&iter)) {
        sglib_hashed_uct_mm_remote_seg_t_delete(self->remote_segments_hash,
                                                remote_seg);
        /* detach the remote process's descriptors segment */
        status = uct_mm_md_mapper_ops(iface->super.md)->detach(remote_seg);
        if (status != UCS_OK) {
            ucs_warn("Unable to detach shared memory segment of descriptors: %s",
                     ucs_status_string(status));
        }
        ucs_free(remote_seg);
    }

    /* detach the remote process's shared memory segment (remote recv FIFO) */
    status = uct_mm_md_mapper_ops(iface->super.md)->detach(&self->mapped_desc);
    if (status != UCS_OK) {
        ucs_error("error detaching from remote FIFO");
    }

    uct_mm_ep_pending_purge(&self->super.super, NULL, NULL);
}

ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter,
                                                  ucs_arbiter_elem_t *elem,
                                                  void *arg)
{
    uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t status;
    uct_mm_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem),
                                       uct_mm_ep_t, arb_group);

    /* update the local tail with its actual value from the remote peer,
     * making sure that the pending sends would use the real tail value */
    ucs_memory_cpu_load_fence();
    ep->cached_tail = ep->fifo_ctl->tail;

    if (!uct_mm_ep_has_tx_resources(ep)) {
        return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
    }

    status = req->func(req);
    ucs_trace_data("progress pending request %p returned %s", req,
                   ucs_status_string(status));

    if (status == UCS_OK) {
        /* sent successfully. remove from the arbiter */
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;
    } else if (status == UCS_INPROGRESS) {
        /* sent but not completed, keep in the arbiter */
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
    } else {
        /* couldn't send. keep this request in the arbiter until the next time
         * this function is called */
        return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
    }
}

int mca_pml_ucx_recv(void *buf, size_t count, ompi_datatype_t *datatype,
                     int src, int tag, struct ompi_communicator_t* comm,
                     ompi_status_public_t* mpi_status)
{
    ucp_tag_t ucp_tag, ucp_tag_mask;
    ompi_request_t *req;

    PML_UCX_TRACE_RECV("%s", buf, count, datatype, src, tag, comm, "recv");

    PML_UCX_MAKE_RECV_TAG(ucp_tag, ucp_tag_mask, tag, src, comm);
    req = (ompi_request_t*)ucp_tag_recv_nb(ompi_pml_ucx.ucp_worker, buf, count,
                                           mca_pml_ucx_get_datatype(datatype),
                                           ucp_tag, ucp_tag_mask,
                                           mca_pml_ucx_blocking_recv_completion);
    if (UCS_PTR_IS_ERR(req)) {
        PML_UCX_ERROR("ucx recv failed: %s",
                      ucs_status_string(UCS_PTR_STATUS(req)));
        return OMPI_ERROR;
    }

    ucp_worker_progress(ompi_pml_ucx.ucp_worker);
    while (!REQUEST_COMPLETE(req)) {
        opal_progress();
    }

    if (mpi_status != MPI_STATUS_IGNORE) {
        *mpi_status = req->req_status;
    }

    req->req_complete = REQUEST_PENDING;
    ucp_request_release(req);
    return OMPI_SUCCESS;
}

ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self)
{
    ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
    ucp_ep_h ep        = req->send.ep;
    ssize_t packed_len;

    if (req->send.wireup.type == UCP_WIREUP_MSG_REQUEST) {
        if (ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED) {
            ucs_trace("ep %p: not sending wireup message - remote already connected",
                      ep);
            goto out;
        }
    }

    /* send the active message */
    if (req->send.wireup.type == UCP_WIREUP_MSG_ACK) {
        req->send.lane = ucp_ep_get_am_lane(ep);
    } else {
        req->send.lane = ucp_ep_get_wireup_msg_lane(ep);
    }
    packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_WIREUP,
                                 ucp_wireup_msg_pack, req);
    if (packed_len < 0) {
        /* a negative length encodes a ucs_status_t error code */
        if (packed_len != UCS_ERR_NO_RESOURCE) {
            ucs_error("failed to send wireup: %s",
                      ucs_status_string((ucs_status_t)packed_len));
        }
        return (ucs_status_t)packed_len;
    }

out:
    ucp_request_complete_send(req, UCS_OK);
    return UCS_OK;
}

int mca_pml_ucx_send(const void *buf, size_t count, ompi_datatype_t *datatype,
                     int dst, int tag, mca_pml_base_send_mode_t mode,
                     struct ompi_communicator_t* comm)
{
    ompi_request_t *req;
    ucp_ep_h ep;

    PML_UCX_TRACE_SEND("%s", buf, count, datatype, dst, tag, mode, comm, "send");

    /* TODO special care to sync/buffered send */

    ep = mca_pml_ucx_get_ep(comm, dst);
    if (OPAL_UNLIKELY(NULL == ep)) {
        PML_UCX_ERROR("Failed to get ep for rank %d", dst);
        return OMPI_ERROR;
    }

    req = (ompi_request_t*)ucp_tag_send_nb(ep, buf, count,
                                           mca_pml_ucx_get_datatype(datatype),
                                           PML_UCX_MAKE_SEND_TAG(tag, comm),
                                           mca_pml_ucx_send_completion);
    if (OPAL_LIKELY(req == NULL)) {
        return OMPI_SUCCESS;
    } else if (!UCS_PTR_IS_ERR(req)) {
        PML_UCX_VERBOSE(8, "got request %p", (void*)req);
        ucp_worker_progress(ompi_pml_ucx.ucp_worker);
        ompi_request_wait(&req, MPI_STATUS_IGNORE);
        return OMPI_SUCCESS;
    } else {
        PML_UCX_ERROR("ucx send failed: %s",
                      ucs_status_string(UCS_PTR_STATUS(req)));
        return OMPI_ERROR;
    }
}

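Both PML entry points above follow UCX's status-pointer convention:
ucp_tag_send_nb()/ucp_tag_recv_nb() return NULL on immediate completion, an
error encoded in the pointer itself, or a live request handle that must be
progressed and released. A hedged sketch of the generic caller-side pattern;
note ucp_request_check_status() is a newer UCP API than some of these
snippets use, so treat its availability as an assumption:

#include <ucp/api/ucp.h>

/* generic handling of a UCX non-blocking status pointer (sketch only) */
static ucs_status_t wait_status_ptr(ucp_worker_h worker, void *request)
{
    ucs_status_t status;

    if (request == NULL) {
        return UCS_OK;                      /* completed immediately */
    }
    if (UCS_PTR_IS_ERR(request)) {
        return UCS_PTR_STATUS(request);     /* failed at issue time */
    }
    /* a real request: drive progress until it leaves UCS_INPROGRESS */
    while (ucp_request_check_status(request) == UCS_INPROGRESS) {
        ucp_worker_progress(worker);
    }
    status = ucp_request_check_status(request);
    ucp_request_release(request);
    return status;
}
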
ucs_arbiter_cb_result_t uct_ugni_ep_process_pending(ucs_arbiter_t *arbiter,
                                                    ucs_arbiter_elem_t *elem,
                                                    void *arg)
{
    uct_ugni_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem),
                                         uct_ugni_ep_t, arb_group);
    uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t rc;

    ep->arb_sched = 1;
    ucs_trace_data("progressing pending request %p", req);
    rc = req->func(req);
    ep->arb_sched = 0;
    ucs_trace_data("status returned from progress pending: %s",
                   ucs_status_string(rc));

    if (UCS_OK == rc) {
        /* sent successfully. remove from the arbiter */
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;
    } else if (UCS_INPROGRESS == rc) {
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
    } else {
        /* couldn't send. keep this request in the arbiter until the next time
         * this function is called */
        return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
    }
}

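The pending-queue callbacks in this section (uct_mm above, ugni here, and
dc_mlx5 further down) all map the request status to arbiter results the same
way. A condensed skeleton of that mapping, as a sketch; the transport-specific
resource checks and bookkeeping are omitted:

/* common shape of a UCT pending-queue arbiter callback (sketch) */
static ucs_arbiter_cb_result_t
pending_cb_skeleton(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, void *arg)
{
    uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t status    = req->func(req);

    if (status == UCS_OK) {
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;   /* sent, drop from queue */
    } else if (status == UCS_INPROGRESS) {
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;    /* sent, not yet complete */
    }
    return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;     /* no resources, retry later */
}
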
static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler)
{
    ucs_async_context_t *async;
    ucs_async_mode_t mode;
    ucs_status_t status;

    mode  = handler->mode;
    async = handler->async;
    if (async != NULL) {
        async->last_wakeup = ucs_get_time();
    }

    if (async == NULL) {
        ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT,
                        UCS_ASYNC_HANDLER_ARG(handler));
        handler->cb(handler->id, handler->arg);
    } else if (ucs_async_method_call(mode, context_try_block, async)) {
        ucs_trace_async("calling async handler " UCS_ASYNC_HANDLER_FMT,
                        UCS_ASYNC_HANDLER_ARG(handler));
        handler->cb(handler->id, handler->arg);
        ucs_async_method_call(mode, context_unblock, async);
    } else /* async != NULL */ {
        ucs_trace_async("missed " UCS_ASYNC_HANDLER_FMT ", last_wakeup %lu",
                        UCS_ASYNC_HANDLER_ARG(handler), async->last_wakeup);
        if (ucs_atomic_cswap32(&handler->missed, 0, 1) == 0) {
            status = ucs_mpmc_queue_push(&async->missed, handler->id);
            if (status != UCS_OK) {
                ucs_fatal("Failed to push event %d to miss queue: %s",
                          handler->id, ucs_status_string(status));
            }
        }
        return UCS_ERR_NO_PROGRESS;
    }
    return UCS_OK;
}

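The miss-queue push above is guarded by a 0-to-1 compare-and-swap on
handler->missed, so a missed handler is enqueued at most once no matter how
many dispatch attempts fail to block. The same gate written with C11 atomics,
as a sketch of the idiom rather than the UCS implementation:

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t missed_flag;

/* returns non-zero for exactly the first caller, mirroring
 * ucs_atomic_cswap32(&handler->missed, 0, 1) == 0 */
static int mark_missed_once(void)
{
    uint32_t expected = 0;
    return atomic_compare_exchange_strong(&missed_flag, &expected, 1);
}
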
static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf,
                                   ucx_perf_params_t *params)
{
    ucp_params_t ucp_params;
    ucp_config_t *config;
    ucs_status_t status;
    uint64_t features;

    status = ucp_perf_test_check_params(params, &features);
    if (status != UCS_OK) {
        goto err;
    }

    status = ucp_config_read(NULL, NULL, &config);
    if (status != UCS_OK) {
        goto err;
    }

    ucp_params.features        = features;
    ucp_params.request_size    = 0;
    ucp_params.request_init    = NULL;
    ucp_params.request_cleanup = NULL;

    status = ucp_init(&ucp_params, config, &perf->ucp.context);
    ucp_config_release(config);
    if (status != UCS_OK) {
        goto err;
    }

    status = ucp_worker_create(perf->ucp.context, params->thread_mode,
                               &perf->ucp.worker);
    if (status != UCS_OK) {
        goto err_cleanup;
    }

    status = ucp_perf_test_alloc_mem(perf, params);
    if (status != UCS_OK) {
        ucs_warn("ucp test failed to allocate memory");
        goto err_destroy_worker;
    }

    status = ucp_perf_test_setup_endpoints(perf, features);
    if (status != UCS_OK) {
        if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) {
            ucs_error("Failed to setup endpoints: %s", ucs_status_string(status));
        }
        goto err_free_mem;
    }

    return UCS_OK;

err_free_mem:
    ucp_perf_test_free_mem(perf);
err_destroy_worker:
    ucp_worker_destroy(perf->ucp.worker);
err_cleanup:
    ucp_cleanup(perf->ucp.context);
err:
    return status;
}

ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_address_t *address,
                           ucp_ep_h *ep_p)
{
    char peer_name[UCP_WORKER_NAME_MAX];
    uint8_t addr_indices[UCP_MAX_LANES];
    ucp_address_entry_t *address_list;
    unsigned address_count;
    ucs_status_t status;
    uint64_t dest_uuid;
    ucp_ep_h ep;

    UCS_ASYNC_BLOCK(&worker->async);

    status = ucp_address_unpack(address, &dest_uuid, peer_name,
                                sizeof(peer_name), &address_count,
                                &address_list);
    if (status != UCS_OK) {
        ucs_error("failed to unpack remote address: %s",
                  ucs_status_string(status));
        goto out;
    }

    ep = ucp_worker_ep_find(worker, dest_uuid);
    if (ep != NULL) {
        /* TODO handle a case where the existing endpoint is incomplete */
        *ep_p  = ep;
        status = UCS_OK;
        goto out_free_address;
    }

    /* allocate endpoint */
    status = ucp_ep_new(worker, dest_uuid, peer_name, "from api call", &ep);
    if (status != UCS_OK) {
        goto out_free_address;
    }

    /* initialize transport endpoints */
    status = ucp_wireup_init_lanes(ep, address_count, address_list,
                                   addr_indices);
    if (status != UCS_OK) {
        goto err_destroy_ep;
    }

    /* send initial wireup message */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_send_request(ep);
        if (status != UCS_OK) {
            goto err_destroy_ep;
        }
    }

    *ep_p = ep;
    goto out_free_address;

err_destroy_ep:
    ucp_ep_destroy(ep);
out_free_address:
    ucs_free(address_list);
out:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}

int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
{
    int my_rank = oshmem_my_proc_id();
    size_t num_reqs, max_reqs;
    void *dreq, **dreqs;
    ucp_ep_h ep;
    size_t i, n;

    oshmem_shmem_barrier();

    if (!mca_spml_ucx.ucp_peers) {
        return OSHMEM_SUCCESS;
    }

    max_reqs = mca_spml_ucx.num_disconnect;
    if (max_reqs > nprocs) {
        max_reqs = nprocs;
    }

    dreqs = malloc(sizeof(*dreqs) * max_reqs);
    if (dreqs == NULL) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    num_reqs = 0;

    /* stagger the disconnect order by rank so that all PEs do not hammer
     * the same peer at once */
    for (i = 0; i < nprocs; ++i) {
        n  = (i + my_rank) % nprocs;
        ep = mca_spml_ucx.ucp_peers[n].ucp_conn;
        if (ep == NULL) {
            continue;
        }

        SPML_VERBOSE(10, "disconnecting from peer %zu", n);
        dreq = ucp_disconnect_nb(ep);
        if (dreq != NULL) {
            if (UCS_PTR_IS_ERR(dreq)) {
                SPML_ERROR("ucp_disconnect_nb(%zu) failed: %s", n,
                           ucs_status_string(UCS_PTR_STATUS(dreq)));
            } else {
                dreqs[num_reqs++] = dreq;
            }
        }

        mca_spml_ucx.ucp_peers[n].ucp_conn = NULL;

        if ((int)num_reqs >= mca_spml_ucx.num_disconnect) {
            mca_spml_ucx_waitall(dreqs, &num_reqs);
        }
    }

    mca_spml_ucx_waitall(dreqs, &num_reqs);
    free(dreqs);

    opal_pmix.fence(NULL, 0);
    free(mca_spml_ucx.ucp_peers);

    return OSHMEM_SUCCESS;
}

static ucs_status_t ucp_add_tl_resources(ucp_context_h context, uct_pd_h pd,
                                         ucp_rsc_index_t pd_index,
                                         const ucp_config_t *config,
                                         unsigned *num_resources_p,
                                         uint64_t *masks)
{
    uct_tl_resource_desc_t *tl_resources;
    ucp_tl_resource_desc_t *tmp;
    unsigned num_resources;
    ucs_status_t status;
    ucp_rsc_index_t i;

    *num_resources_p = 0;

    /* check what are the available uct resources */
    status = uct_pd_query_tl_resources(pd, &tl_resources, &num_resources);
    if (status != UCS_OK) {
        ucs_error("Failed to query resources: %s", ucs_status_string(status));
        goto err;
    }

    if (num_resources == 0) {
        ucs_debug("No tl resources found for pd %s",
                  context->pd_rscs[pd_index].pd_name);
        goto out_free_resources;
    }

    tmp = ucs_realloc(context->tl_rscs,
                      sizeof(*context->tl_rscs) *
                      (context->num_tls + num_resources),
                      "ucp resources");
    if (tmp == NULL) {
        ucs_error("Failed to allocate resources");
        status = UCS_ERR_NO_MEMORY;
        goto err_free_resources;
    }

    /* copy only the resources enabled by user configuration */
    context->tl_rscs = tmp;
    for (i = 0; i < num_resources; ++i) {
        if (ucp_is_resource_enabled(&tl_resources[i], config, masks)) {
            context->tl_rscs[context->num_tls].tl_rsc   = tl_resources[i];
            context->tl_rscs[context->num_tls].pd_index = pd_index;
            ++context->num_tls;
            ++(*num_resources_p);
        }
    }

out_free_resources:
    uct_release_tl_resource_list(tl_resources);
    return UCS_OK;

err_free_resources:
    uct_release_tl_resource_list(tl_resources);
err:
    return status;
}

static UCS_F_ALWAYS_INLINE ucs_status_ptr_t
ucp_tag_send_req(ucp_request_t *req, size_t count,
                 const ucp_ep_msg_config_t* msg_config,
                 size_t rndv_rma_thresh, size_t rndv_am_thresh,
                 ucp_send_callback_t cb, const ucp_proto_t *proto)
{
    size_t seg_size     = (msg_config->max_bcopy - proto->only_hdr_size);
    size_t rndv_thresh  = ucp_tag_get_rndv_threshold(req, count,
                                                     msg_config->max_iov,
                                                     rndv_rma_thresh,
                                                     rndv_am_thresh, seg_size);
    size_t zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count,
                                                        rndv_thresh);
    ssize_t max_short   = ucp_proto_get_short_max(req, msg_config);
    ucs_status_t status;

    ucs_trace_req("select tag request(%p) progress algorithm datatype=%lx "
                  "buffer=%p length=%zu max_short=%zd rndv_thresh=%zu "
                  "zcopy_thresh=%zu",
                  req, req->send.datatype, req->send.buffer, req->send.length,
                  max_short, rndv_thresh, zcopy_thresh);

    status = ucp_request_send_start(req, max_short, zcopy_thresh, seg_size,
                                    rndv_thresh, proto);
    if (ucs_unlikely(status != UCS_OK)) {
        if (status == UCS_ERR_NO_PROGRESS) {
            ucs_assert(req->send.length >= rndv_thresh);
            /* RMA/AM rendezvous */
            status = ucp_tag_send_start_rndv(req);
        }
        if (status != UCS_OK) {
            return UCS_STATUS_PTR(status);
        }
    }

    ucp_request_send_tag_stat(req);

    /*
     * Start the request.
     * If it is completed immediately, release the request and return the status.
     * Otherwise, return the request.
     */
    status = ucp_request_send(req);
    if (req->flags & UCP_REQUEST_FLAG_COMPLETED) {
        ucs_trace_req("releasing send request %p, returning status %s", req,
                      ucs_status_string(status));
        ucp_request_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucp_request_set_callback(req, send.cb, cb);
    ucs_trace_req("returning send request %p", req);
    return req + 1;
}

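Note the `return req + 1` at the end: UCP hands the application a pointer just
past the internal ucp_request_t, which is also why the trace messages in this
section print both `req` and `req + 1`. A hedged sketch of the conversion in
both directions; the helper names here are hypothetical:

/* external handle = memory immediately past the internal descriptor */
static inline void *request_to_handle(ucp_request_t *req)
{
    return req + 1;
}

/* recover the internal descriptor from a user-visible handle */
static inline ucp_request_t *handle_to_request(void *handle)
{
    return (ucp_request_t*)handle - 1;
}
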
ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_address_t *address,
                           ucp_ep_h *ep_p)
{
    char peer_name[UCP_WORKER_NAME_MAX];
    ucs_status_t status;
    uint64_t dest_uuid;
    unsigned address_count;
    ucp_address_entry_t *address_list;
    ucp_ep_h ep;

    UCS_ASYNC_BLOCK(&worker->async);

    status = ucp_address_unpack(address, &dest_uuid, peer_name,
                                sizeof(peer_name), &address_count,
                                &address_list);
    if (status != UCS_OK) {
        ucs_error("failed to unpack remote address: %s",
                  ucs_status_string(status));
        goto out;
    }

    ep = ucp_worker_ep_find(worker, dest_uuid);
    if (ep != NULL) {
        /* TODO handle a case where the existing endpoint is incomplete */
        ucs_debug("returning existing ep %p which is already connected to %"PRIx64,
                  ep, ep->dest_uuid);
        *ep_p  = ep;
        status = UCS_OK;
        goto out_free_address;
    }

    status = ucp_ep_create_connected(worker, dest_uuid, peer_name,
                                     address_count, address_list,
                                     " from api call", &ep);
    if (status != UCS_OK) {
        goto out_free_address;
    }

    /* send initial wireup message */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_send_request(ep);
        if (status != UCS_OK) {
            goto err_destroy_ep;
        }
    }

    *ep_p = ep;
    goto out_free_address;

err_destroy_ep:
    ucp_ep_destroy(ep);
out_free_address:
    ucs_free(address_list);
out:
    UCS_ASYNC_UNBLOCK(&worker->async);
    return status;
}

static ucs_status_t uct_perf_test_alloc_mem(ucx_perf_context_t *perf,
                                            ucx_perf_params_t *params)
{
    ucs_status_t status;

    /* TODO use params->alignment */

    status = uct_iface_mem_alloc(perf->uct.iface,
                                 params->message_size * params->thread_count,
                                 0, "perftest", &perf->uct.send_mem);
    if (status != UCS_OK) {
        ucs_error("Failed to allocate send buffer: %s",
                  ucs_status_string(status));
        goto err;
    }
    ucs_assert(perf->uct.send_mem.md == perf->uct.md);
    perf->send_buffer = perf->uct.send_mem.address;

    status = uct_iface_mem_alloc(perf->uct.iface,
                                 params->message_size * params->thread_count,
                                 0, "perftest", &perf->uct.recv_mem);
    if (status != UCS_OK) {
        ucs_error("Failed to allocate receive buffer: %s",
                  ucs_status_string(status));
        goto err_free_send;
    }
    ucs_assert(perf->uct.recv_mem.md == perf->uct.md);
    perf->recv_buffer = perf->uct.recv_mem.address;

    perf->offset = 0;

    ucs_debug("allocated memory. Send buffer %p, Recv buffer %p",
              perf->send_buffer, perf->recv_buffer);
    return UCS_OK;

err_free_send:
    uct_iface_mem_free(&perf->uct.send_mem);
err:
    return status;
}

int mca_spml_ucx_fence(shmem_ctx_t ctx)
{
    ucs_status_t err;
    mca_spml_ucx_ctx_t *ucx_ctx = (mca_spml_ucx_ctx_t *)ctx;

    err = ucp_worker_fence(ucx_ctx->ucp_worker);
    if (UCS_OK != err) {
        SPML_UCX_ERROR("fence failed: %s", ucs_status_string(err));
        oshmem_shmem_abort(-1);
        return OSHMEM_ERROR;
    }
    return OSHMEM_SUCCESS;
}

static int ucx_perf_thread_spawn(ucx_perf_params_t* params,
                                 ucx_perf_result_t* result)
{
    ucx_perf_context_t perf;
    ucs_status_t status;
    int ti;
    int nti = params->thread_count;

    ucx_perf_thread_context_t* tctx =
        calloc(nti, sizeof(ucx_perf_thread_context_t));
    ucs_status_t* statuses = calloc(nti, sizeof(ucs_status_t));

    pthread_barrier_t tbarrier;
    pthread_barrier_init(&tbarrier, NULL, nti);

    ucx_perf_test_reset(&perf, params);
    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (UCS_OK != status) {
        goto out_cleanup;
    }

    for (ti = 0; ti < nti; ti++) {
        tctx[ti].tid      = ti;
        tctx[ti].ntid     = nti;
        tctx[ti].tbarrier = &tbarrier;
        tctx[ti].statuses = statuses;
        tctx[ti].params   = *params;
        tctx[ti].perf     = perf;
        /* Doctor the src and dst buffers to make them thread specific */
        tctx[ti].perf.send_buffer += ti * params->message_size;
        tctx[ti].perf.recv_buffer += ti * params->message_size;
        pthread_create(&tctx[ti].pt, NULL, ucx_perf_thread_run_test,
                       (void*)&tctx[ti]);
    }

    for (ti = 0; ti < nti; ti++) {
        pthread_join(tctx[ti].pt, NULL);
        if (UCS_OK != statuses[ti]) {
            ucs_error("Thread %d failed to run test: %s", tctx[ti].tid,
                      ucs_status_string(statuses[ti]));
            status = statuses[ti];
        }
    }

    ucx_perf_funcs[params->api].cleanup(&perf);

out_cleanup:
    pthread_barrier_destroy(&tbarrier);
    free(statuses);
    free(tctx);
    return status;
}

static ucs_status_ptr_t ucp_disconnect_nb_internal(ucp_ep_h ep)
{
    ucs_status_t status;
    ucp_request_t *req;

    ucs_debug("disconnect ep %p", ep);

    req = ucs_mpool_get(&ep->worker->req_mp);
    if (req == NULL) {
        return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);
    }

    /*
     * Flush operation can be queued on the pending queue of only one of the
     * lanes (indicated by req->send.lane) and scheduled for completion on any
     * number of lanes. req->send.uct_comp.count keeps track of how many lanes
     * are not flushed yet, and when it reaches zero, it means all lanes are
     * flushed. req->send.flush.lanes keeps track of which lanes we still have
     * to start flush on.
     * If a flush is completed from a pending/completion callback, we need to
     * schedule slow-path callback to release the endpoint later, since a UCT
     * endpoint cannot be released from pending/completion callback context.
     */
    req->flags                  = 0;
    req->status                 = UCS_OK;
    req->send.ep                = ep;
    req->send.flush.flushed_cb  = ucp_ep_disconnected;
    req->send.flush.lanes       = UCS_MASK(ucp_ep_num_lanes(ep));
    req->send.flush.cbq_elem.cb = ucp_ep_flushed_slow_path_callback;
    req->send.flush.cbq_elem_on = 0;
    req->send.lane              = UCP_NULL_LANE;
    req->send.uct.func          = ucp_ep_flush_progress_pending;
    req->send.uct_comp.func     = ucp_ep_flush_completion;
    req->send.uct_comp.count    = ucp_ep_num_lanes(ep);

    ucp_ep_flush_progress(req);

    if (req->send.uct_comp.count == 0) {
        status = req->status;
        ucp_ep_disconnected(req);
        ucs_trace_req("ep %p: releasing flush request %p, returning status %s",
                      ep, req, ucs_status_string(status));
        ucs_mpool_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucs_trace_req("ep %p: return inprogress flush request %p (%p)", ep, req,
                  req + 1);
    return req + 1;
}

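The comment above describes the two trackers used by ucp_ep_flush_progress()
earlier in this section: flush.lanes (a bitmap of lanes on which a flush has
not yet started) and uct_comp.count (how many lanes are not yet flushed; zero
means done). A minimal generic sketch of that counting discipline, with
hypothetical names standing in for the UCP request fields:

#include <stdint.h>

/* hypothetical tracker mirroring req->send.flush.lanes / uct_comp.count */
typedef struct {
    uint64_t lanes_to_start;   /* bitmap: flushes not started yet */
    int      lanes_pending;    /* flushes not completed yet */
} flush_state_t;

/* called once per lane, whether it completes inline or via a callback;
 * returns non-zero when the endpoint is fully flushed */
static int flush_state_complete_one(flush_state_t *s)
{
    return (--s->lanes_pending == 0);
}
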
static void mca_pml_ucx_blocking_recv_completion(void *request,
                                                 ucs_status_t status,
                                                 ucp_tag_recv_info_t *info)
{
    ompi_request_t *req = request;

    PML_UCX_VERBOSE(8, "blocking receive request %p completed with status %s "
                    "tag %"PRIx64" len %zu",
                    (void*)req, ucs_status_string(status), info->sender_tag,
                    info->length);

    mca_pml_ucx_set_recv_status(&req->req_status, status, info);
    PML_UCX_ASSERT(!(REQUEST_COMPLETE(req)));
    ompi_request_complete(req, true);
}

static UCS_F_ALWAYS_INLINE void
ucp_tag_recv_request_completed(ucp_request_t *req, ucs_status_t status,
                               ucp_tag_recv_info_t *info, const char *function)
{
    ucs_trace_req("%s returning completed request %p (%p) stag 0x%"PRIx64
                  " len %zu, %s",
                  function, req, req + 1, info->sender_tag, info->length,
                  ucs_status_string(status));

    req->status = status;
    /* record the profile event before the request can be returned to the
     * pool; the |= sets COMPLETED and yields the updated flags for the
     * RELEASED test */
    UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", 0);
    if ((req->flags |= UCP_REQUEST_FLAG_COMPLETED) & UCP_REQUEST_FLAG_RELEASED) {
        ucp_request_put(req);
    }
}

static inline ucs_status_ptr_t
ucp_tag_send_req(ucp_request_t *req, size_t count, ssize_t max_short,
                 size_t zcopy_thresh, size_t rndv_thresh,
                 ucp_send_callback_t cb, const ucp_proto_t *proto)
{
    ucs_status_t status;

    switch (req->send.datatype & UCP_DATATYPE_CLASS_MASK) {
    case UCP_DATATYPE_CONTIG:
        status = ucp_tag_req_start_contig(req, count, max_short, zcopy_thresh,
                                          rndv_thresh, proto);
        if (status != UCS_OK) {
            return UCS_STATUS_PTR(status);
        }
        break;

    case UCP_DATATYPE_IOV:
        status = ucp_tag_req_start_iov(req, count, max_short, zcopy_thresh,
                                       rndv_thresh, proto);
        if (status != UCS_OK) {
            return UCS_STATUS_PTR(status);
        }
        break;

    case UCP_DATATYPE_GENERIC:
        ucp_tag_req_start_generic(req, count, rndv_thresh, proto);
        break;

    default:
        ucs_error("Invalid data type");
        return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM);
    }

    /*
     * Start the request.
     * If it is completed immediately, release the request and return the status.
     * Otherwise, return the request.
     */
    status = ucp_request_start_send(req);
    if (req->flags & UCP_REQUEST_FLAG_COMPLETED) {
        ucs_trace_req("releasing send request %p, returning status %s", req,
                      ucs_status_string(status));
        ucs_mpool_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucs_trace_req("returning send request %p", req);
    req->send.cb = cb;
    return req + 1;
}

/*
 * @param [in] rsc_tli  Resource index for every lane.
 */
static ucs_status_t ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type,
                                        uint64_t tl_bitmap,
                                        const ucp_rsc_index_t *rsc_tli)
{
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane;
    unsigned order[UCP_MAX_LANES + 1];
    ucp_request_t* req;
    ucs_status_t status;
    void *address;

    ucs_assert(ep->cfg_index != (uint8_t)-1);

    req = ucs_mpool_get(&ep->worker->req_mp);
    if (req == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    req->flags            = UCP_REQUEST_FLAG_RELEASED;
    req->send.ep          = ep;
    req->send.cb          = ucp_wireup_msg_send_completion;
    req->send.wireup.type = type;
    req->send.uct.func    = ucp_wireup_msg_progress;

    /* pack all addresses */
    status = ucp_address_pack(ep->worker, ep, tl_bitmap, order,
                              &req->send.length, &address);
    if (status != UCS_OK) {
        ucs_error("failed to pack address: %s", ucs_status_string(status));
        ucs_mpool_put(req); /* release the request on failure to avoid a leak */
        return status;
    }

    req->send.buffer = address;

    /* send the indices of the addresses that should be connected by the
     * remote side */
    for (lane = 0; lane < UCP_MAX_LANES; ++lane) {
        rsc_index = rsc_tli[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            req->send.wireup.tli[lane] = ucp_wireup_address_index(order,
                                                                  tl_bitmap,
                                                                  rsc_index);
        } else {
            req->send.wireup.tli[lane] = -1;
        }
    }

    ucp_request_start_send(req);
    return UCS_OK;
}

/**
 * dispatch requests waiting for tx resources
 */
ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_pending_tx(ucs_arbiter_t *arbiter,
                                    ucs_arbiter_elem_t *elem,
                                    void *arg)
{
    uct_dc_mlx5_ep_t *ep = ucs_container_of(ucs_arbiter_elem_group(elem),
                                            uct_dc_mlx5_ep_t, arb_group);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                                uct_dc_mlx5_iface_t);
    uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t status;

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ARBITER_CB_RESULT_STOP;
    }

    status = req->func(req);
    ucs_trace_data("progress pending request %p returned: %s", req,
                   ucs_status_string(status));

    if (status == UCS_OK) {
        /* For dcs* policies release dci if this is the last elem in the group
         * and the dci has no outstanding operations. For example pending
         * callback did not send anything. (uct_ep_flush or just return ok)
         */
        if (ucs_arbiter_elem_is_last(&ep->arb_group, elem)) {
            uct_dc_mlx5_iface_dci_free(iface, ep);
        }
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;
    }
    if (status == UCS_INPROGRESS) {
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        /* Deschedule the group even if FC is the only missing resource.
         * It will be scheduled again when credits arrive.
         * We can't desched a group with the rand policy if non-FC resources
         * are missing, since it would never be scheduled again. */
        if (uct_dc_mlx5_iface_is_dci_rand(iface) &&
            uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
            return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
        } else {
            return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
        }
    }

    ucs_assertv(!uct_rc_iface_has_tx_resources(&iface->super.super),
                "pending callback returned error but send resources are available");
    return UCS_ARBITER_CB_RESULT_STOP;
}

static int ucx_perf_thread_spawn(ucx_perf_params_t* params,
                                 ucx_perf_result_t* result)
{
    ucx_perf_context_t perf;
    ucs_status_t status = UCS_OK;
    int ti, nti;

    omp_set_num_threads(params->thread_count);
    nti = params->thread_count;

    ucx_perf_thread_context_t* tctx =
        calloc(nti, sizeof(ucx_perf_thread_context_t));
    ucs_status_t* statuses = calloc(nti, sizeof(ucs_status_t));

    ucx_perf_test_reset(&perf, params);
    status = ucx_perf_funcs[params->api].setup(&perf, params);
    if (UCS_OK != status) {
        goto out_cleanup;
    }

#pragma omp parallel private(ti)
    {
        ti = omp_get_thread_num();
        tctx[ti].tid      = ti;
        tctx[ti].ntid     = nti;
        tctx[ti].statuses = statuses;
        tctx[ti].params   = *params;
        tctx[ti].perf     = perf;
        /* Doctor the src and dst buffers to make them thread specific */
        tctx[ti].perf.send_buffer += ti * params->message_size;
        tctx[ti].perf.recv_buffer += ti * params->message_size;
        tctx[ti].perf.offset       = ti * params->message_size;
        ucx_perf_thread_run_test((void*)&tctx[ti]);
    }

    for (ti = 0; ti < nti; ti++) {
        if (UCS_OK != statuses[ti]) {
            ucs_error("Thread %d failed to run test: %s", tctx[ti].tid,
                      ucs_status_string(statuses[ti]));
            status = statuses[ti];
        }
    }

    ucx_perf_funcs[params->api].cleanup(&perf);

out_cleanup:
    free(statuses);
    free(tctx);
    return status;
}

static UCS_F_ALWAYS_INLINE void
ucp_tag_recv_request_completed(ucp_request_t *req, ucs_status_t status,
                               ucp_tag_recv_info_t *info, const char *function)
{
    ucs_trace_req("%s returning completed request %p (%p) stag 0x%"PRIx64
                  " len %zu, %s",
                  function, req, req + 1, info->sender_tag, info->length,
                  ucs_status_string(status));

    req->status = status;
    req->flags |= UCP_REQUEST_FLAG_COMPLETED;
    if (req->flags & UCP_REQUEST_FLAG_BLOCK_OFFLOAD) {
        --req->recv.worker->context->tm.sw_req_count;
    }
    UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", 0);
}

static int ucx_perf_thread_spawn(ucx_perf_context_t *perf,
                                 ucx_perf_result_t* result)
{
    ucx_perf_thread_context_t* tctx;
    ucs_status_t* statuses;
    size_t message_size;
    ucs_status_t status;
    int ti, nti;

    message_size = ucx_perf_get_message_size(&perf->params);
    omp_set_num_threads(perf->params.thread_count);
    nti = perf->params.thread_count;

    tctx     = calloc(nti, sizeof(ucx_perf_thread_context_t));
    statuses = calloc(nti, sizeof(ucs_status_t));
    if ((tctx == NULL) || (statuses == NULL)) {
        status = UCS_ERR_NO_MEMORY;
        goto out_free;
    }

#pragma omp parallel private(ti)
    {
        ti = omp_get_thread_num();
        tctx[ti].tid      = ti;
        tctx[ti].ntid     = nti;
        tctx[ti].statuses = statuses;
        tctx[ti].perf     = *perf;
        /* Doctor the src and dst buffers to make them thread specific */
        tctx[ti].perf.send_buffer += ti * message_size;
        tctx[ti].perf.recv_buffer += ti * message_size;
        tctx[ti].perf.offset       = ti * message_size;
        ucx_perf_thread_run_test((void*)&tctx[ti]);
    }

    status = UCS_OK;
    for (ti = 0; ti < nti; ti++) {
        if (UCS_OK != statuses[ti]) {
            ucs_error("Thread %d failed to run test: %s", tctx[ti].tid,
                      ucs_status_string(statuses[ti]));
            status = statuses[ti];
        }
    }

out_free:
    free(statuses);
    free(tctx);
    return status;
}

static ucs_status_t ucp_ep_send_wireup_am(ucp_ep_h ep, uct_ep_h uct_ep,
                                          uint8_t am_id,
                                          ucp_rsc_index_t dst_rsc_index)
{
    ucp_rsc_index_t rsc_index    = ep->uct.rsc_index;
    ucp_worker_h worker          = ep->worker;
    ucp_context_h context        = worker->context;
    uct_iface_attr_t *iface_attr = &worker->iface_attrs[rsc_index];
    ucp_wireup_msg_t *msg;
    ucs_status_t status;
    size_t msg_len;

    msg_len = sizeof(*msg) + iface_attr->ep_addr_len;

    /* TODO use custom pack callback to avoid this allocation and memcopy */
    msg = ucs_malloc(msg_len, "conn_req");
    if (msg == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err;
    }

    msg->src_uuid      = worker->uuid;
    msg->src_pd_index  = context->tl_rscs[rsc_index].pd_index;
    msg->src_rsc_index = rsc_index;
    msg->dst_rsc_index = dst_rsc_index;

    status = uct_ep_get_address(ep->uct.next_ep, (struct sockaddr *)(msg + 1));
    if (status != UCS_OK) {
        goto err_free;
    }

    status = uct_ep_am_bcopy(uct_ep, am_id, (uct_pack_callback_t)memcpy,
                             msg, msg_len);
    if (status != UCS_OK) {
        ucs_debug("failed to send conn msg: %s%s", ucs_status_string(status),
                  (status == UCS_ERR_NO_RESOURCE) ? ", will retry" : "");
        goto err_free;
    }

    ucp_wireup_log(worker, am_id, msg, 1);
    ucs_free(msg);
    return UCS_OK;

err_free:
    ucs_free(msg);
err:
    return status;
}