/**
 * Allocate a zero-initialized endpoint, fill in its defaults and register it
 * in the worker's endpoint hash.
 *
 * @param [in]  worker     Worker that owns the new endpoint.
 * @param [in]  dest_uuid  UUID stored as the endpoint's destination.
 * @param [in]  peer_name  Peer name; copied into the endpoint only when
 *                         ENABLE_DEBUG_DATA is set.
 * @param [in]  message    Free-form text appended to the debug log line.
 * @param [out] ep_p       Filled with the new endpoint on success.
 *
 * @return UCS_OK on success, UCS_ERR_NO_MEMORY if the allocation failed
 *         (in which case @a ep_p is not written).
 */
ucs_status_t ucp_ep_new(ucp_worker_h worker, uint64_t dest_uuid,
                        const char *peer_name, const char *message,
                        ucp_ep_h *ep_p)
{
    ucp_ep_h new_ep;

    new_ep = ucs_calloc(1, sizeof(*new_ep), "ucp ep");
    if (new_ep == NULL) {
        ucs_error("Failed to allocate ep");
        return UCS_ERR_NO_MEMORY;
    }

    /* Identity and (not yet selected) transport state */
    new_ep->worker       = worker;
    new_ep->dest_uuid    = dest_uuid;
    new_ep->uct_ep       = NULL;
    new_ep->rsc_index    = -1;
    new_ep->dst_pd_index = -1;
    new_ep->state        = 0;

    /* Every protocol threshold starts at SIZE_MAX */
    new_ep->config.max_short_egr = SIZE_MAX;
    new_ep->config.max_bcopy_egr = SIZE_MAX;
    new_ep->config.max_short_put = SIZE_MAX;
    new_ep->config.max_bcopy_put = SIZE_MAX;
    new_ep->config.max_bcopy_get = SIZE_MAX;

    /* The endpoint is fully initialized; publish it in the worker's hash */
    sglib_hashed_ucp_ep_t_add(worker->ep_hash, new_ep);

#if ENABLE_DEBUG_DATA
    ucs_snprintf_zero(new_ep->peer_name, UCP_PEER_NAME_MAX, "%s", peer_name);
#endif

    ucs_debug("created ep %p to %s 0x%"PRIx64"->0x%"PRIx64" %s", new_ep,
              ucp_ep_peer_name(new_ep), worker->uuid, new_ep->dest_uuid,
              message);

    *ep_p = new_ep;
    return UCS_OK;
}
/**
 * Start a non-blocking synchronous tagged send.
 *
 * A request is taken from the endpoint's worker, initialized for the given
 * buffer/datatype/tag and handed to ucp_tag_send_req() together with the
 * endpoint's synchronous zcopy and rendezvous thresholds and the eager-sync
 * protocol table.
 *
 * @param [in] ep        Destination endpoint.
 * @param [in] buffer    Data to send.
 * @param [in] count     Number of elements in @a buffer.
 * @param [in] datatype  Datatype of the elements.
 * @param [in] tag       Message tag.
 * @param [in] cb        Callback forwarded to ucp_tag_send_req().
 *
 * @return Whatever ucp_tag_send_req() returns, or
 *         UCS_STATUS_PTR(UCS_ERR_NO_MEMORY) if no request could be obtained.
 */
ucs_status_ptr_t ucp_tag_send_sync_nb(ucp_ep_h ep, const void *buffer,
                                      size_t count, ucp_datatype_t datatype,
                                      ucp_tag_t tag, ucp_send_callback_t cb)
{
    ucp_request_t *sreq;

    ucs_trace_req("send_sync_nb buffer %p count %zu tag %"PRIx64" to %s cb %p",
                  buffer, count, tag, ucp_ep_peer_name(ep), cb);

    sreq = ucp_request_get(ep->worker);
    if (sreq == NULL) {
        return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);
    }

    UCS_INSTRUMENT_RECORD(UCS_INSTRUMENT_TYPE_UCP_TX, "ucp_tag_send_sync_nb",
                          sreq,
                          ucp_dt_length(datatype, count, buffer,
                                        &sreq->send.state));

    /* The remote side has to send a reply, so make it connect back to us */
    ucp_ep_connect_remote(ep);

    ucp_tag_send_req_init(sreq, ep, buffer, datatype, tag);

    return ucp_tag_send_req(sreq, count,
                            -1, /* disable short method */
                            ucp_ep_config(ep)->sync_zcopy_thresh,
                            ucp_ep_config(ep)->sync_rndv_thresh,
                            cb, &ucp_tag_eager_sync_proto);
}
/**
 * Non-blocking tagged send.
 *
 * First attempts the send through ucp_tag_send_try(). Any result other than
 * UCS_ERR_NO_RESOURCE (including UCS_OK) is returned to the caller right away
 * as a status pointer; only UCS_ERR_NO_RESOURCE falls through to
 * ucp_tag_send_slow().
 *
 * @param [in] ep        Destination endpoint.
 * @param [in] buffer    Data to send.
 * @param [in] count     Number of elements in @a buffer.
 * @param [in] datatype  Datatype of the elements.
 * @param [in] tag       Message tag.
 * @param [in] cb        Callback passed to the slow path.
 *
 * @return Status pointer from the first attempt, or the result of
 *         ucp_tag_send_slow().
 */
ucs_status_ptr_t ucp_tag_send_nb(ucp_ep_h ep, const void *buffer, size_t count,
                                 uintptr_t datatype, ucp_tag_t tag,
                                 ucp_send_callback_t cb)
{
    ucs_status_t try_status;

    ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p",
                  buffer, count, tag, ucp_ep_peer_name(ep), cb);

    try_status = ucp_tag_send_try(ep, buffer, count, datatype, tag);
    if (ucs_likely(try_status != UCS_ERR_NO_RESOURCE)) {
        /* Either done (UCS_OK) or failed for a reason retrying won't fix */
        return UCS_STATUS_PTR(try_status);
    }

    /* Out of resources on the first attempt: take the slow path */
    return ucp_tag_send_slow(ep, buffer, count, datatype, tag, cb);
}
/**
 * Print a human-readable description of an endpoint.
 *
 * Writes a "#"-prefixed header, a peer line carrying the destination UUID
 * (preceded by the peer name when ENABLE_DEBUG_DATA is set), the endpoint
 * configuration via ucp_ep_config_print(), and a closing "#" line.
 *
 * @param [in] ep      Endpoint to describe.
 * @param [in] stream  Output stream.
 */
void ucp_ep_print_info(ucp_ep_h ep, FILE *stream)
{
    const char *peer;
    const char *separator;

    fprintf(stream, "#\n"
                    "# UCP endpoint\n"
                    "#\n");

    /* The peer name is only available in debug-data builds */
#if ENABLE_DEBUG_DATA
    peer      = ucp_ep_peer_name(ep);
    separator = ", ";
#else
    peer      = "";
    separator = "";
#endif

    fprintf(stream, "# peer: %s%suuid 0x%"PRIx64"\n", peer, separator,
            ep->dest_uuid);

    ucp_ep_config_print(stream, ep->worker, ucp_ep_config(ep), NULL);

    fprintf(stream, "#\n");
}
/**
 * Non-blocking tagged send with an inline eager-short fast path.
 *
 * Fast path: for a contiguous datatype whose total length does not exceed the
 * endpoint's max_eager_short, the data is sent with
 * ucp_tag_send_eager_short(). Any result other than UCS_ERR_NO_RESOURCE
 * (including UCS_OK) is returned immediately as a status pointer.
 *
 * Slow path: otherwise a request is taken from the worker, initialized, and
 * passed to ucp_tag_send_req() with the endpoint's eager/zcopy/rendezvous
 * thresholds and the eager protocol table.
 *
 * @param [in] ep        Destination endpoint.
 * @param [in] buffer    Data to send.
 * @param [in] count     Number of elements in @a buffer.
 * @param [in] datatype  Datatype of the elements.
 * @param [in] tag       Message tag.
 * @param [in] cb        Callback forwarded to ucp_tag_send_req().
 *
 * @return Status pointer from the fast path, the result of
 *         ucp_tag_send_req(), or UCS_STATUS_PTR(UCS_ERR_NO_MEMORY) if no
 *         request could be obtained.
 */
ucs_status_ptr_t ucp_tag_send_nb(ucp_ep_h ep, const void *buffer, size_t count,
                                 uintptr_t datatype, ucp_tag_t tag,
                                 ucp_send_callback_t cb)
{
    ucp_request_t *sreq;
    ucs_status_t short_status;
    size_t contig_len;

    ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p",
                  buffer, count, tag, ucp_ep_peer_name(ep), cb);

    if (ucs_likely((datatype & UCP_DATATYPE_CLASS_MASK) ==
                   UCP_DATATYPE_CONTIG)) {
        contig_len = ucp_contig_dt_length(datatype, count);
        UCS_INSTRUMENT_RECORD(UCS_INSTRUMENT_TYPE_UCP_TX,
                              "ucp_tag_send_nb (eager - start)", buffer,
                              contig_len);

        if (ucs_likely(contig_len <= ucp_ep_config(ep)->max_eager_short)) {
            short_status = ucp_tag_send_eager_short(ep, tag, buffer,
                                                    contig_len);
            if (ucs_likely(short_status != UCS_ERR_NO_RESOURCE)) {
                UCS_INSTRUMENT_RECORD(UCS_INSTRUMENT_TYPE_UCP_TX,
                                      "ucp_tag_send_nb (eager - finish)",
                                      buffer, contig_len);
                /* UCS_OK is reported through this return as well */
                return UCS_STATUS_PTR(short_status);
            }
        }
    }

    /* Fast path not applicable or out of resources: use a request */
    sreq = ucp_request_get(ep->worker);
    if (sreq == NULL) {
        return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);
    }

    UCS_INSTRUMENT_RECORD(UCS_INSTRUMENT_TYPE_UCP_TX, "ucp_tag_send_nb", sreq,
                          ucp_dt_length(datatype, count, buffer,
                                        &sreq->send.state));

    ucp_tag_send_req_init(sreq, ep, buffer, datatype, tag);

    return ucp_tag_send_req(sreq, count,
                            ucp_ep_config(ep)->max_eager_short,
                            ucp_ep_config(ep)->zcopy_thresh,
                            ucp_ep_config(ep)->rndv_thresh,
                            cb, &ucp_tag_eager_proto);
}
/**
 * Non-blocking tagged send backed by the worker's request memory pool.
 *
 * The send is first attempted with ucp_tag_send_try(); any result other than
 * UCS_ERR_NO_RESOURCE (including UCS_OK) is returned immediately as a status
 * pointer. Otherwise a request is taken from the worker's request pool and
 * started with ucp_tag_send_start_req(). If the request is not completed at
 * that point it is added to the endpoint's pending queue and the worker is
 * progressed once.
 *
 * @param [in] ep        Destination endpoint.
 * @param [in] buffer    Data to send.
 * @param [in] count     Number of elements in @a buffer.
 * @param [in] datatype  Datatype of the elements.
 * @param [in] tag       Message tag.
 * @param [in] cb        Send callback stored in the request.
 *
 * @return A status pointer when the operation finished or failed without a
 *         request being handed out; otherwise a pointer to the user area that
 *         follows the internal request header (req + 1).
 */
ucs_status_ptr_t ucp_tag_send_nb(ucp_ep_h ep, const void *buffer, size_t count,
                                 uintptr_t datatype, ucp_tag_t tag,
                                 ucp_send_callback_t cb)
{
    ucs_status_t status;
    ucp_request_t *req;

    ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s", buffer,
                  count, tag, ucp_ep_peer_name(ep));

    status = ucp_tag_send_try(ep, buffer, count, datatype, tag);
    if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) {
        return UCS_STATUS_PTR(status); /* UCS_OK also goes here */
    }

    req = ucs_mpool_get(&ep->worker->req_mp);
    if (req == NULL) {
        return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);
    }

    /* The user-visible area that follows the request header is handed out
     * below, so mark it as defined for valgrind */
    VALGRIND_MAKE_MEM_DEFINED(req + 1, ep->worker->context->config.request.size);

    req->flags   = 0;
    req->cb.send = cb;

    status = ucp_tag_send_start_req(ep, buffer, count, datatype, tag, req);
    if (status != UCS_OK) {
        /* Error path only. The request is never returned to the caller here,
         * so it must go back to the pool or it would be leaked. */
        ucs_mpool_put(req);
        return UCS_STATUS_PTR(status);
    }

    if (!(req->flags & UCP_REQUEST_FLAG_COMPLETED)) {
        ucp_ep_add_pending(ep, ep->uct_ep, req);
        ucp_worker_progress(ep->worker);
    }

    ucs_trace_req("send_nb returning request %p", req);
    return req + 1;
}
/**
 * Choose the set of lanes for an endpoint and build its configuration key.
 *
 * RMA, AMO, AM and rendezvous lanes are added (in that order) into a local
 * descriptor array, the array is sorted by RMA score, and the result is
 * translated into @a key: per-lane resource indices, the AM and rendezvous
 * lane numbers, the RMA and AMO lane maps, the reachable-MD map and the lane
 * used for wireup messages.
 *
 * @param [in]  ep             Endpoint being wired up.
 * @param [in]  address_count  Number of entries in @a address_list.
 * @param [in]  address_list   Remote addresses to choose from.
 * @param [out] addr_indices   For every selected lane, the index of the remote
 *                             address it uses.
 * @param [out] key            Configuration key; zeroed and then filled in.
 *
 * @return UCS_OK on success, the error from one of the lane-adding helpers, or
 *         UCS_ERR_UNREACHABLE if no lane at all was selected.
 */
ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, unsigned address_count,
                                     const ucp_address_entry_t *address_list,
                                     uint8_t *addr_indices,
                                     ucp_ep_config_key_t *key)
{
    ucp_worker_h worker = ep->worker;
    ucp_wireup_lane_desc_t lane_descs[UCP_MAX_LANES];
    const ucp_wireup_lane_desc_t *desc;
    ucp_lane_index_t amo_count = 0;
    ucp_rsc_index_t tl_idx, md_idx;
    ucp_lane_index_t idx;
    ucs_status_t status;

    memset(key, 0, sizeof(*key));
    memset(lane_descs, 0, sizeof(lane_descs));

    /* Collect lanes for every usage type; each helper appends to lane_descs
     * and bumps key->num_lanes */
    status = ucp_wireup_add_rma_lanes(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_amo_lanes(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_am_lane(ep, address_count, address_list,
                                    lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    status = ucp_wireup_add_rndv_lane(ep, address_count, address_list,
                                      lane_descs, &key->num_lanes);
    if (status != UCS_OK) {
        return status;
    }

    /* User should not create endpoints unless requested communication features */
    if (key->num_lanes == 0) {
        ucs_error("No transports selected to %s", ucp_ep_peer_name(ep));
        return UCS_ERR_UNREACHABLE;
    }

    /* Sort lanes according to RMA score */
    qsort(lane_descs, key->num_lanes, sizeof(*lane_descs),
          ucp_wireup_compare_lane_rma_score);

    /* Construct the endpoint configuration key:
     * - arrange lane description in the EP configuration
     * - create remote MD bitmap
     * - create bitmap of lanes used for RMA and AMO
     * - if AM lane exists and fits for wireup messages, select it for this
     *   purpose.
     */
    key->am_lane   = UCP_NULL_LANE;
    key->rndv_lane = UCP_NULL_LANE;

    for (idx = 0; idx < key->num_lanes; ++idx) {
        desc   = &lane_descs[idx];
        tl_idx = desc->rsc_index;
        md_idx = desc->dst_md_index;

        key->lanes[idx]   = tl_idx;
        addr_indices[idx] = desc->addr_index;

        ucs_assert(desc->usage != 0);

        if (desc->usage & UCP_WIREUP_LANE_USAGE_AM) {
            /* At most one AM lane is expected */
            ucs_assert(key->am_lane == UCP_NULL_LANE);
            key->am_lane = idx;
        }

        if (desc->usage & UCP_WIREUP_LANE_USAGE_RMA) {
            key->rma_lane_map |= UCS_BIT(md_idx + idx * UCP_MD_INDEX_BITS);
        }

        if (desc->usage & UCP_WIREUP_LANE_USAGE_AMO) {
            key->amo_lanes[amo_count] = idx;
            ++amo_count;
        }

        if (desc->usage & UCP_WIREUP_LANE_USAGE_RNDV) {
            /* At most one rendezvous lane is expected */
            ucs_assert(key->rndv_lane == UCP_NULL_LANE);
            key->rndv_lane = idx;
        }
    }

    /* Sort and add AMO lanes */
    ucs_qsort_r(key->amo_lanes, amo_count, sizeof(*key->amo_lanes),
                ucp_wireup_compare_lane_amo_score, lane_descs);

    for (idx = 0; idx < UCP_MAX_LANES; ++idx) {
        if (idx >= amo_count) {
            /* Unused tail of the AMO lane array */
            key->amo_lanes[idx] = UCP_NULL_LANE;
            continue;
        }

        md_idx = lane_descs[key->amo_lanes[idx]].dst_md_index;
        key->amo_lane_map |= UCS_BIT(md_idx + idx * UCP_MD_INDEX_BITS);
    }

    key->reachable_md_map = ucp_wireup_get_reachable_mds(worker, address_count,
                                                         address_list);
    key->wireup_msg_lane  = ucp_wireup_select_wireup_msg_lane(worker,
                                                              address_list,
                                                              lane_descs,
                                                              key->num_lanes);
    return UCS_OK;
}
/**
 * Select a local and remote transport.
 *
 * First builds a bitmap of the remote addresses that are on an MD listed in
 * @a remote_md_map and whose MD and interface flags satisfy @a criteria.
 * Then, for every local transport resource whose MD and interface flags
 * satisfy the local criteria, scores each reachable remote address from that
 * bitmap with criteria->calc_score(). The (local resource, remote address)
 * pair with the highest score is reported; on equal scores the pair found
 * first is kept.
 *
 * @param [in]  ep                Endpoint the selection is made for.
 * @param [in]  address_list      Remote addresses.
 * @param [in]  address_count     Number of entries in @a address_list.
 * @param [in]  criteria          Required flags, score function and a title
 *                                used in log messages.
 * @param [in]  remote_md_map     Bitmap of remote MD indices that may be used.
 * @param [in]  show_error        If nonzero, log an error when nothing is found.
 * @param [out] rsc_index_p       Index of the selected local resource.
 * @param [out] dst_addr_index_p  Index of the selected remote address.
 * @param [out] score_p           Score of the selected pair.
 *
 * @return UCS_OK if a pair was selected; UCS_ERR_UNREACHABLE otherwise, in
 *         which case the output parameters are not written.
 */
static UCS_F_NOINLINE ucs_status_t
ucp_wireup_select_transport(ucp_ep_h ep, const ucp_address_entry_t *address_list,
                            unsigned address_count,
                            const ucp_wireup_criteria_t *criteria,
                            uint64_t remote_md_map, int show_error,
                            ucp_rsc_index_t *rsc_index_p,
                            unsigned *dst_addr_index_p, double *score_p)
{
    ucp_worker_h worker   = ep->worker;
    ucp_context_h context = worker->context;
    uct_tl_resource_desc_t *resource;
    const ucp_address_entry_t *ae;
    ucp_rsc_index_t rsc_index;
    double score, best_score;
    char tls_info[256];
    char *p, *endp;
    uct_iface_attr_t *iface_attr;
    uct_md_attr_t *md_attr;
    uint64_t addr_index_map;
    unsigned addr_index;
    int reachable;
    int found;

    found       = 0;
    best_score  = 0.0;

    /* tls_info accumulates the reasons for rejecting local resources; 'p' is
     * the current write position and always points at the terminating '\0' */
    p           = tls_info;
    endp        = tls_info + sizeof(tls_info) - 1;
    tls_info[0] = '\0';

    /* Check which remote addresses satisfy the criteria */
    addr_index_map = 0;
    for (ae = address_list; ae < address_list + address_count; ++ae) {
        addr_index = ae - address_list;

        if (!(remote_md_map & UCS_BIT(ae->md_index))) {
            ucs_trace("addr[%u]: not in use, because on md[%d]", addr_index,
                      ae->md_index);
            continue;
        }

        if (!ucs_test_all_flags(ae->md_flags, criteria->remote_md_flags)) {
            ucs_trace("addr[%u]: no %s", addr_index,
                      ucp_wireup_get_missing_flag_desc(ae->md_flags,
                                                       criteria->remote_md_flags,
                                                       ucp_wireup_md_flags));
            continue;
        }

        if (!ucs_test_all_flags(ae->iface_attr.cap_flags,
                                criteria->remote_iface_flags)) {
            ucs_trace("addr[%u]: no %s", addr_index,
                      ucp_wireup_get_missing_flag_desc(ae->iface_attr.cap_flags,
                                                       criteria->remote_iface_flags,
                                                       ucp_wireup_iface_flags));
            continue;
        }

        addr_index_map |= UCS_BIT(addr_index);
    }

    /* For each local resource try to find the best remote address to connect to.
     * Pick the best local resource to satisfy the criteria.
     * best one has the highest score (from the dedicated score_func) and
     * has a reachable tl on the remote peer */
    for (rsc_index = 0; rsc_index < context->num_tls; ++rsc_index) {
        resource   = &context->tl_rscs[rsc_index].tl_rsc;
        iface_attr = &worker->iface_attrs[rsc_index];
        md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];

        /* Check that local md and interface satisfy the criteria */
        if (!ucp_wireup_check_flags(resource, md_attr->cap.flags,
                                    criteria->local_md_flags, criteria->title,
                                    ucp_wireup_md_flags, p, endp - p) ||
            !ucp_wireup_check_flags(resource, iface_attr->cap.flags,
                                    criteria->local_iface_flags, criteria->title,
                                    ucp_wireup_iface_flags, p, endp - p))
        {
            /* Skip past the text written at 'p' and add a separator */
            p += strlen(p);
            snprintf(p, endp - p, ", ");
            p += strlen(p);
            continue;
        }

        reachable = 0;

        for (ae = address_list; ae < address_list + address_count; ++ae) {
            if (!(addr_index_map & UCS_BIT(ae - address_list)) ||
                !ucp_wireup_is_reachable(worker, rsc_index, ae))
            {
                /* Must be reachable device address, on same transport */
                continue;
            }

            reachable = 1;

            score = criteria->calc_score(md_attr, iface_attr, &ae->iface_attr);
            ucs_assert(score >= 0.0);

            /* 'ae - address_list' is a pointer difference, hence %td */
            ucs_trace(UCT_TL_RESOURCE_DESC_FMT "->addr[%td] : %s score %.2f",
                      UCT_TL_RESOURCE_DESC_ARG(resource), ae - address_list,
                      criteria->title, score);

            /* Strict comparison: the first pair with a given score is kept */
            if (!found || (score > best_score)) {
                *rsc_index_p      = rsc_index;
                *dst_addr_index_p = ae - address_list;
                *score_p          = score;
                best_score        = score;
                found             = 1;
            }
        }

        /* If a local resource cannot reach any of the remote addresses, generate
         * debug message.
         */
        if (!reachable) {
            snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT
                     " - cannot reach remote worker, ",
                     UCT_TL_RESOURCE_DESC_ARG(resource));
            p += strlen(p);
        }
    }

    if (p >= tls_info + 2) {
        *(p - 2) = '\0'; /* trim last "," */
    }

    if (!found) {
        if (show_error) {
            ucs_error("No %s transport to %s: %s", criteria->title,
                      ucp_ep_peer_name(ep), tls_info);
        }
        return UCS_ERR_UNREACHABLE;
    }

    ucs_trace("ep %p: selected for %s: " UCT_TL_RESOURCE_DESC_FMT
              " -> '%s' address[%u],md[%d] score %.2f", ep, criteria->title,
              UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[*rsc_index_p].tl_rsc),
              ucp_ep_peer_name(ep), *dst_addr_index_p,
              address_list[*dst_addr_index_p].md_index, best_score);
    return UCS_OK;
}
} UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb, (ep, buffer, count, datatype, tag, cb), ucp_ep_h ep, const void *buffer, size_t count, uintptr_t datatype, ucp_tag_t tag, ucp_send_callback_t cb) { ucs_status_t status; ucp_request_t *req; size_t length; ucs_status_ptr_t ret; UCP_THREAD_CS_ENTER_CONDITIONAL(&ep->worker->mt_lock); ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p", buffer, count, tag, ucp_ep_peer_name(ep), cb); if (ucs_likely(UCP_DT_IS_CONTIG(datatype))) { length = ucp_contig_dt_length(datatype, count); if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->tag.eager.max_short)) { status = UCS_PROFILE_CALL(ucp_tag_send_eager_short, ep, tag, buffer, length); if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { UCP_EP_STAT_TAG_OP(ep, EAGER); ret = UCS_STATUS_PTR(status); /* UCS_OK also goes here */ goto out; } } } req = ucp_request_get(ep->worker);