/*
 * Add an active-message lane to the lane descriptors, if one is needed.
 * AM is required when the TAG feature is enabled; otherwise it is required
 * only when at least one already-selected lane uses a p2p transport (the
 * wireup protocol itself needs AM in that case).
 */
static ucs_status_t ucp_wireup_add_am_lane(ucp_ep_h ep, unsigned address_count,
                                           const ucp_address_entry_t *address_list,
                                           ucp_wireup_lane_desc_t *lane_descs,
                                           ucp_lane_index_t *num_lanes_p)
{
    ucp_wireup_criteria_t criteria;
    ucp_rsc_index_t selected_rsc;
    ucp_lane_index_t lane_it;
    ucs_status_t status;
    unsigned selected_addr;
    double selected_score;
    int am_required;

    /* Without TAG, AM is needed only for wireup over p2p transports */
    if (!(ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG)) {
        am_required = 0;
        for (lane_it = 0; lane_it < *num_lanes_p; ++lane_it) {
            if (ucp_worker_is_tl_p2p(ep->worker, lane_descs[lane_it].rsc_index)) {
                am_required = 1;
                break;
            }
        }
        if (!am_required) {
            return UCS_OK;
        }
    }

    /* Build the selection criteria for a single AM-capable lane */
    criteria.title              = "active messages";
    criteria.local_md_flags     = 0;
    criteria.remote_md_flags    = 0;
    criteria.local_iface_flags  = UCT_IFACE_FLAG_AM_BCOPY;
    criteria.remote_iface_flags = UCT_IFACE_FLAG_AM_BCOPY |
                                  UCT_IFACE_FLAG_AM_CB_SYNC;
    criteria.calc_score         = ucp_wireup_am_score_func;

    /* Wakeup support is demanded from the remote side only when both TAG
     * and WAKEUP features were requested */
    if (ucs_test_all_flags(ucp_ep_get_context_features(ep),
                           UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP)) {
        criteria.remote_iface_flags |= UCT_IFACE_FLAG_WAKEUP;
    }

    status = ucp_wireup_select_transport(ep, address_list, address_count,
                                         &criteria, -1, 1, &selected_rsc,
                                         &selected_addr, &selected_score);
    if (status != UCS_OK) {
        return status;
    }

    ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, selected_rsc,
                             selected_addr,
                             address_list[selected_addr].md_index,
                             selected_score, UCP_WIREUP_LANE_USAGE_AM);
    return UCS_OK;
}
/*
 * Notify the stub endpoints of all p2p lanes that the remote peer has
 * completed its side of the connection.
 */
static void ucp_wireup_ep_remote_connected(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_lane_index_t lane_idx;
    ucp_rsc_index_t tl_rsc;

    ucs_trace("ep %p: remote connected", ep);

    for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(ep); ++lane_idx) {
        tl_rsc = ucp_ep_get_rsc_index(ep, lane_idx);
        /* Only p2p lanes go through the stub-ep wireup flow */
        if (!ucp_worker_is_tl_p2p(worker, tl_rsc)) {
            continue;
        }
        ucp_stub_ep_remote_connected(ep->uct_eps[lane_idx]);
    }
}
/*
 * Connect the local side of every p2p lane of 'ep' to the remote addresses.
 *
 * @param ep             Endpoint whose p2p lanes should be connected.
 * @param tli            Per-lane index into 'address_list' — which remote
 *                       address entry each lane connects to.
 * @param address_count  Number of entries in 'address_list' (not referenced
 *                       directly here; kept for interface symmetry).
 * @param address_list   Unpacked remote address entries.
 *
 * @return UCS_OK, or the first error returned by uct_ep_connect_to_ep().
 */
static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_worker_h worker = ep->worker;
    const ucp_address_entry_t *address;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane, amo_index;
    ucs_status_t status;
    ucp_md_map_t UCS_V_UNUSED md_map; /* debug-build-only: used in asserts */

    ucs_trace("ep %p: connect local transports", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        /* Only p2p transports require an explicit connect-to-ep step */
        if (!ucp_worker_is_tl_p2p(worker, rsc_index)) {
            continue;
        }

        /* A p2p lane must have a transport endpoint address to connect to */
        address = &address_list[tli[lane]];
        ucs_assert(address->tl_addr_len > 0);

        /* Check that if the lane is used for RMA/AMO, destination md index matches */
        md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.rma_lane_map, lane);
        ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                    "lane=%d ai=%d md_map=0x%x md_index=%d",
                    lane, tli[lane], md_map, address->md_index);

        amo_index = ucp_ep_get_amo_lane_index(&ucp_ep_config(ep)->key, lane);
        if (amo_index != UCP_NULL_LANE) {
            md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.amo_lane_map,
                                           amo_index);
            ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                        "lane=%d ai=%d md_map=0x%x md_index=%d",
                        lane, tli[lane], md_map, address->md_index);
        }

        status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr,
                                      address->ep_addr);
        if (status != UCS_OK) {
            /* NOTE(review): earlier lanes stay connected on failure — caller
             * is expected to handle partial connection state */
            return status;
        }
    }

    return UCS_OK;
}
/*
 * Pick the lane that will carry wireup messages: the first lane whose local
 * and remote interface capabilities satisfy the auxiliary wireup criteria;
 * failing that, the last lane backed by a p2p transport (or UCP_NULL_RESOURCE
 * when no candidate exists).
 */
static ucp_lane_index_t
ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker,
                                  const ucp_address_entry_t *address_list,
                                  const ucp_wireup_lane_desc_t *lane_descs,
                                  ucp_lane_index_t num_lanes)
{
    ucp_context_h context = worker->context;
    ucp_lane_index_t fallback_lane = UCP_NULL_RESOURCE;
    uct_tl_resource_desc_t *tl_rsc;
    ucp_rsc_index_t rsc;
    ucp_lane_index_t i;
    unsigned ai;
    int meets_criteria;

    for (i = 0; i < num_lanes; ++i) {
        rsc    = lane_descs[i].rsc_index;
        ai     = lane_descs[i].addr_index;
        tl_rsc = &context->tl_rscs[rsc].tl_rsc;

        /* A lane satisfies the wireup criteria when both the local interface
         * and the peer's advertised capabilities pass the aux flag checks
         * (remote side is checked only if the local side passed). */
        meets_criteria =
            ucp_wireup_check_flags(tl_rsc,
                                   worker->iface_attrs[rsc].cap.flags,
                                   ucp_wireup_aux_criteria.local_iface_flags,
                                   ucp_wireup_aux_criteria.title,
                                   ucp_wireup_iface_flags, NULL, 0) &&
            ucp_wireup_check_flags(tl_rsc,
                                   address_list[ai].iface_attr.cap_flags,
                                   ucp_wireup_aux_criteria.remote_iface_flags,
                                   ucp_wireup_aux_criteria.title,
                                   ucp_wireup_iface_flags, NULL, 0);
        if (meets_criteria) {
            return i;
        }

        /* Remember a p2p lane as fallback carrier for wireup messages */
        if (ucp_worker_is_tl_p2p(worker, rsc)) {
            fallback_lane = i;
        }
    }

    return fallback_lane;
}
/*
 * Send a wireup request message for 'ep', at most once per endpoint.
 * Builds the per-lane p2p resource table and the bitmap of transports whose
 * addresses must be packed into the message.
 */
ucs_status_t ucp_wireup_send_request(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    uint64_t tl_bitmap = 0;
    ucp_rsc_index_t rsc;
    ucp_lane_index_t i;
    ucs_status_t status;

    /* The request is idempotent: skip if it was already sent */
    if (ep->flags & UCP_EP_FLAG_CONNECT_REQ_SENT) {
        return UCS_OK;
    }

    ucs_assert_always(!ucp_ep_is_stub(ep));

    /* For every used lane, record its resource (p2p lanes only) and mark its
     * transport in the address bitmap; unused slots get UCP_NULL_RESOURCE. */
    for (i = 0; i < UCP_MAX_LANES; ++i) {
        rsc_tli[i] = UCP_NULL_RESOURCE;
        if (i >= ucp_ep_num_lanes(ep)) {
            continue;
        }
        rsc = ucp_ep_get_rsc_index(ep, i);
        if (ucp_worker_is_tl_p2p(worker, rsc)) {
            rsc_tli[i] = rsc;
        }
        tl_bitmap |= UCS_BIT(rsc);
    }

    /* TODO make sure such lane would exist */
    rsc = ucp_stub_ep_get_aux_rsc_index(
              ep->uct_eps[ucp_ep_get_wireup_msg_lane(ep)]);
    if (rsc != UCP_NULL_RESOURCE) {
        tl_bitmap |= UCS_BIT(rsc);
    }

    ucs_debug("ep %p: send wireup request (flags=0x%x)", ep, ep->flags);
    status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, rsc_tli);

    ep->flags |= UCP_EP_FLAG_CONNECT_REQ_SENT;
    return status;
}
/*
 * Select and create the endpoint's lanes from the remote address list, then
 * establish local connections on each lane.
 *
 * @param ep             Endpoint to (re)configure.
 * @param address_count  Number of entries in 'address_list'.
 * @param address_list   Unpacked remote address entries.
 * @param addr_indices   Output: per-lane index into 'address_list'.
 *
 * On failure, all uct endpoints created so far are destroyed.
 * Reconfiguration of an already-configured non-stub endpoint is not
 * supported and aborts with ucs_fatal().
 */
ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned address_count,
                                   const ucp_address_entry_t *address_list,
                                   uint8_t *addr_indices)
{
    ucp_worker_h worker = ep->worker;
    ucp_ep_config_key_t key;
    uint16_t new_cfg_index;
    ucp_lane_index_t lane;
    ucs_status_t status;
    uint8_t conn_flag;
    char str[32];

    ucs_trace("ep %p: initialize lanes", ep);

    status = ucp_wireup_select_lanes(ep, address_count, address_list,
                                     addr_indices, &key);
    if (status != UCS_OK) {
        goto err;
    }

    /* Accumulate reachability across reconfigurations */
    key.reachable_md_map |= ucp_ep_config(ep)->key.reachable_md_map;

    new_cfg_index = ucp_worker_get_ep_config(worker, &key);
    if ((ep->cfg_index == new_cfg_index)) {
        return UCS_OK; /* No change */
    }

    if ((ep->cfg_index != 0) && !ucp_ep_is_stub(ep)) {
        /*
         * TODO handle a case where we have to change lanes and reconfigure the ep:
         *
         * - if we already have uct ep connected to an address - move it to the new lane index
         * - if we don't yet have connection to an address - create it
         * - if an existing lane is not connected anymore - delete it (possibly)
         * - if the configuration has changed - replay all pending operations on all lanes -
         *   need that every pending callback would return, in case of failure, the number
         *   of lane it wants to be queued on.
         */
        ucs_debug("cannot reconfigure ep %p from [%d] to [%d]", ep, ep->cfg_index,
                  new_cfg_index);
        ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, "old", NULL);
        ucp_wireup_print_config(worker->context, &key, "new", NULL);
        ucs_fatal("endpoint reconfiguration not supported yet");
    }

    ep->cfg_index = new_cfg_index;
    ep->am_lane   = key.am_lane;

    snprintf(str, sizeof(str), "ep %p", ep);
    ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, str,
                            addr_indices);

    ucs_trace("ep %p: connect lanes", ep);

    /* establish connections on all underlying endpoints */
    conn_flag = UCP_EP_FLAG_LOCAL_CONNECTED;
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        status = ucp_wireup_connect_lane(ep, lane, address_count, address_list,
                                         addr_indices[lane]);
        if (status != UCS_OK) {
            goto err;
        }
        if (ucp_worker_is_tl_p2p(worker, ucp_ep_get_rsc_index(ep, lane))) {
            conn_flag = 0; /* If we have a p2p transport, we're not connected */
        }
    }

    ep->flags |= conn_flag;
    return UCS_OK;

err:
    /* Tear down any uct endpoints created before the failure */
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        if (ep->uct_eps[lane] != NULL) {
            uct_ep_destroy(ep->uct_eps[lane]);
            ep->uct_eps[lane] = NULL;
        }
    }
    return status;
}
/*
 * Handle an incoming wireup request: create the endpoint if needed,
 * initialize its lanes from the peer's address list, connect the local side
 * of p2p lanes, and reply with the lane-to-remote-lane mapping.
 *
 * Errors are not propagated (AM handler context) — the function simply
 * returns, leaving the connection incomplete.
 */
static void ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg,
                                       uint64_t uuid, const char *peer_name,
                                       unsigned address_count,
                                       const ucp_address_entry_t *address_list)
{
    ucp_ep_h ep = ucp_worker_ep_find(worker, uuid);
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    uint8_t addr_indices[UCP_MAX_LANES];
    ucp_lane_index_t lane, remote_lane;
    ucp_rsc_index_t rsc_index;
    ucs_status_t status;
    uint64_t tl_bitmap = 0;

    ucs_trace("ep %p: got wireup request from %s", ep, peer_name);

    /* Create a new endpoint if does not exist */
    if (ep == NULL) {
        status = ucp_ep_new(worker, uuid, peer_name, "remote-request", &ep);
        if (status != UCS_OK) {
            return;
        }
    }

    /* Initialize lanes (possible destroy existing lanes) */
    status = ucp_wireup_init_lanes(ep, address_count, address_list, addr_indices);
    if (status != UCS_OK) {
        return;
    }

    /* Connect p2p addresses to remote endpoint */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_connect_local(ep, addr_indices, address_count,
                                          address_list);
        if (status != UCS_OK) {
            return;
        }

        ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED;

        /* Construct the list that tells the remote side with which address we
         * have connected to each of its lanes.
         */
        /* NOTE(review): memset(-1) relies on UCP_NULL_RESOURCE being the
         * all-ones value of ucp_rsc_index_t — verify if the type changes */
        memset(rsc_tli, -1, sizeof(rsc_tli));
        for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
            rsc_index = ucp_ep_get_rsc_index(ep, lane);
            for (remote_lane = 0; remote_lane < UCP_MAX_LANES; ++remote_lane) {
                /* If 'lane' has connected to 'remote_lane' ... */
                if (addr_indices[lane] == msg->tli[remote_lane]) {
                    ucs_assert(ucp_worker_is_tl_p2p(worker, rsc_index));
                    rsc_tli[remote_lane] = rsc_index;
                    tl_bitmap           |= UCS_BIT(rsc_index);
                }
            }
        }

        ucs_trace("ep %p: sending wireup reply", ep);

        status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, rsc_tli);
        if (status != UCS_OK) {
            return;
        }
    }
}
/*
 * Compute protocol thresholds and size limits for an endpoint configuration:
 * p2p lane bitmap, active-message limits (short/bcopy/zcopy and the zcopy
 * threshold), per-lane RMA limits, and the rendezvous threshold.
 *
 * Fix vs. original: in the RMA loop, 'iface_attr' was computed from
 * 'worker->iface_attrs[rsc_index]' BEFORE checking rsc_index against
 * UCP_NULL_RESOURCE, forming an out-of-bounds pointer for stub lanes.
 * The lookup is now performed only for valid resources.
 */
void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
{
    ucp_context_h context = worker->context;
    ucp_ep_rma_config_t *rma_config;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t rsc_index;
    uct_md_attr_t *md_attr;
    ucp_lane_index_t lane;
    double zcopy_thresh, numerator, denumerator;
    size_t rndv_thresh;

    /* Default settings */
    config->zcopy_thresh        = SIZE_MAX;
    config->sync_zcopy_thresh   = -1;
    config->bcopy_thresh        = context->config.ext.bcopy_thresh;
    config->rndv_thresh         = SIZE_MAX;
    config->sync_rndv_thresh    = SIZE_MAX;
    config->max_rndv_get_zcopy  = SIZE_MAX;
    config->p2p_lanes           = 0;

    /* Collect p2p lanes */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        rsc_index = config->key.lanes[lane];
        if ((rsc_index != UCP_NULL_RESOURCE) &&
            ucp_worker_is_tl_p2p(worker, rsc_index))
        {
            config->p2p_lanes |= UCS_BIT(lane);
        }
    }

    /* Configuration for active messages */
    if (config->key.am_lane != UCP_NULL_LANE) {
        lane      = config->key.am_lane;
        rsc_index = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr = &worker->iface_attrs[rsc_index];
            md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_SHORT) {
                /* Short limits are reduced by the protocol header size */
                config->max_eager_short = iface_attr->cap.am.max_short -
                                          sizeof(ucp_eager_hdr_t);
                config->max_am_short    = iface_attr->cap.am.max_short -
                                          sizeof(uint64_t);
            }

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_BCOPY) {
                config->max_am_bcopy = iface_attr->cap.am.max_bcopy;
            }

            if ((iface_attr->cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) &&
                (md_attr->cap.flags & UCT_MD_FLAG_REG))
            {
                config->max_am_zcopy = iface_attr->cap.am.max_zcopy;

                if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                    /* auto: find the size where registration cost is amortized
                     * by the bandwidth advantage over bcopy */
                    zcopy_thresh = md_attr->reg_cost.overhead / (
                                       (1.0 / context->config.ext.bcopy_bw) -
                                       (1.0 / iface_attr->bandwidth) -
                                       md_attr->reg_cost.growth);
                    if (zcopy_thresh < 0) {
                        /* negative => zcopy never pays off; keep it disabled */
                        config->zcopy_thresh      = SIZE_MAX;
                        config->sync_zcopy_thresh = -1;
                    } else {
                        config->zcopy_thresh      = zcopy_thresh;
                        config->sync_zcopy_thresh = zcopy_thresh;
                    }
                } else {
                    config->zcopy_thresh      = context->config.ext.zcopy_thresh;
                    config->sync_zcopy_thresh = context->config.ext.zcopy_thresh;
                }
            }
        } else {
            config->max_am_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for remote memory access */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        if (ucp_lane_map_get_lane(config->key.rma_lane_map, lane) == 0) {
            continue;
        }

        rma_config = &config->rma[lane];
        rsc_index  = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            /* BUGFIX: look up iface_attrs only for a valid resource index
             * (previously indexed with UCP_NULL_RESOURCE for stub lanes) */
            iface_attr = &worker->iface_attrs[rsc_index];
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_SHORT) {
                rma_config->max_put_short = iface_attr->cap.put.max_short;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_BCOPY) {
                rma_config->max_put_bcopy = iface_attr->cap.put.max_bcopy;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_BCOPY) {
                rma_config->max_get_bcopy = iface_attr->cap.get.max_bcopy;
            }
        } else {
            rma_config->max_put_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for Rendezvous data */
    if (config->key.rndv_lane != UCP_NULL_LANE) {
        lane      = config->key.rndv_lane;
        rsc_index = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr = &worker->iface_attrs[rsc_index];
            md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];
            ucs_assert_always(iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY);

            if (context->config.ext.rndv_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                /* auto */
                /* Make UCX calculate the rndv threshold on its own.
                 * We do it by finding the message size at which rndv and eager_zcopy get
                 * the same latency. Starting this message size (rndv_thresh), rndv's latency
                 * would be lower and the reached bandwidth would be higher.
                 * The latency function for eager_zcopy is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth +
                 *   max(size/bw , size/bcopy_bw) + overhead ]
                 * The latency function for Rendezvous is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth + latency + overhead +
                 *   reg_cost.overhead + size * md_attr->reg_cost.growth + overhead + latency +
                 *   size/bw + latency + overhead + latency ]
                 * Isolating the 'size' yields the rndv_thresh.
                 * The used latency functions for eager_zcopy and rndv are also specified in
                 * the UCX wiki */
                numerator   = ((2 * iface_attr->overhead) +
                               (4 * iface_attr->latency) +
                               md_attr->reg_cost.overhead);
                denumerator = (ucs_max((1.0 / iface_attr->bandwidth),
                                       (1.0 / context->config.ext.bcopy_bw)) -
                               md_attr->reg_cost.growth -
                               (1.0 / iface_attr->bandwidth));

                if (denumerator > 0) {
                    rndv_thresh = numerator / denumerator;
                    ucs_trace("rendezvous threshold is %zu ( = %f / %f)",
                              rndv_thresh, numerator, denumerator);
                } else {
                    /* model degenerates: fall back to the configured value */
                    rndv_thresh = context->config.ext.rndv_thresh_fallback;
                    ucs_trace("rendezvous threshold is %zu", rndv_thresh);
                }

                /* for the 'auto' mode in the rndv_threshold, we enforce the usage of rndv
                 * to a value that can be set by the user.
                 * to disable rndv, need to set a high value for the rndv_threshold
                 * (without the 'auto' mode)*/
                config->rndv_thresh        = rndv_thresh;
                config->sync_rndv_thresh   = rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;
            } else {
                config->rndv_thresh        = context->config.ext.rndv_thresh;
                config->sync_rndv_thresh   = context->config.ext.rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;
                ucs_trace("rendezvous threshold is %zu", config->rndv_thresh);
            }
        } else {
            ucs_debug("rendezvous protocol is not supported ");
        }
    }
}