Beispiel #1
0
static ucs_status_t ucp_wireup_add_am_lane(ucp_ep_h ep, unsigned address_count,
                                           const ucp_address_entry_t *address_list,
                                           ucp_wireup_lane_desc_t *lane_descs,
                                           ucp_lane_index_t *num_lanes_p)
{
    ucp_wireup_criteria_t criteria;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane;
    ucs_status_t status;
    unsigned addr_index;
    double score;
    int need_am;

    /* Check if we need active messages, for wireup */
    if (!(ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG)) {
        need_am = 0;
        for (lane = 0; lane < *num_lanes_p; ++lane) {
            need_am = need_am || ucp_worker_is_tl_p2p(ep->worker,
                                                      lane_descs[lane].rsc_index);
        }
        if (!need_am) {
            return UCS_OK;
        }
    }

    /* Select one lane for active messages */
    criteria.title              = "active messages";
    criteria.local_md_flags     = 0;
    criteria.remote_md_flags    = 0;
    criteria.remote_iface_flags = UCT_IFACE_FLAG_AM_BCOPY |
                                  UCT_IFACE_FLAG_AM_CB_SYNC;
    criteria.local_iface_flags  = UCT_IFACE_FLAG_AM_BCOPY;
    criteria.calc_score         = ucp_wireup_am_score_func;

    if (ucs_test_all_flags(ucp_ep_get_context_features(ep), UCP_FEATURE_TAG |
                                                            UCP_FEATURE_WAKEUP)) {
        criteria.remote_iface_flags |= UCT_IFACE_FLAG_WAKEUP;
    }

    status = ucp_wireup_select_transport(ep, address_list, address_count, &criteria,
                                         -1, 1, &rsc_index, &addr_index, &score);
    if (status != UCS_OK) {
        return status;
    }

    ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index,
                             address_list[addr_index].md_index, score,
                             UCP_WIREUP_LANE_USAGE_AM);
    return UCS_OK;
}
Beispiel #2
0
static void ucp_wireup_ep_remote_connected(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane;

    ucs_trace("ep %p: remote connected", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        if (ucp_worker_is_tl_p2p(worker, rsc_index)) {
            ucp_stub_ep_remote_connected(ep->uct_eps[lane]);
        }
    }
}
Beispiel #3
0
static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_worker_h worker = ep->worker;
    const ucp_address_entry_t *address;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane, amo_index;
    ucs_status_t status;
    ucp_md_map_t UCS_V_UNUSED md_map;

    ucs_trace("ep %p: connect local transports", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        if (!ucp_worker_is_tl_p2p(worker, rsc_index)) {
            continue;
        }

        address = &address_list[tli[lane]];
        ucs_assert(address->tl_addr_len > 0);

        /* Check that if the lane is used for RMA/AMO, destination md index matches */
        md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.rma_lane_map, lane);
        ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                    "lane=%d ai=%d md_map=0x%x md_index=%d", lane, tli[lane],
                    md_map, address->md_index);

        amo_index = ucp_ep_get_amo_lane_index(&ucp_ep_config(ep)->key, lane);
        if (amo_index != UCP_NULL_LANE) {
            md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.amo_lane_map,
                                           amo_index);
            ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                        "lane=%d ai=%d md_map=0x%x md_index=%d", lane, tli[lane],
                        md_map, address->md_index);
        }

        status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr,
                                      address->ep_addr);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
Beispiel #4
0
static ucp_lane_index_t
ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker,
                                  const ucp_address_entry_t *address_list,
                                  const ucp_wireup_lane_desc_t *lane_descs,
                                  ucp_lane_index_t num_lanes)
{
    ucp_context_h context     = worker->context;
    ucp_lane_index_t p2p_lane = UCP_NULL_RESOURCE;
    uct_tl_resource_desc_t *resource;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane;
    unsigned addr_index;

    for (lane = 0; lane < num_lanes; ++lane) {
        rsc_index  = lane_descs[lane].rsc_index;
        addr_index = lane_descs[lane].addr_index;
        resource   = &context->tl_rscs[rsc_index].tl_rsc;

        /* if the current lane satisfies the wireup criteria, choose it for wireup.
         * if it doesn't take a lane with a p2p transport */
        if (ucp_wireup_check_flags(resource,
                                   worker->iface_attrs[rsc_index].cap.flags,
                                   ucp_wireup_aux_criteria.local_iface_flags,
                                   ucp_wireup_aux_criteria.title,
                                   ucp_wireup_iface_flags, NULL, 0) &&
            ucp_wireup_check_flags(resource,
                                   address_list[addr_index].iface_attr.cap_flags,
                                   ucp_wireup_aux_criteria.remote_iface_flags,
                                   ucp_wireup_aux_criteria.title,
                                   ucp_wireup_iface_flags, NULL, 0))
         {
             return lane;
         } else if (ucp_worker_is_tl_p2p(worker, rsc_index)) {
             p2p_lane = lane;
         }
    }

    return p2p_lane;
}
Beispiel #5
0
ucs_status_t ucp_wireup_send_request(ucp_ep_h ep)
{
    ucp_worker_h worker = ep->worker;
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    ucp_rsc_index_t rsc_index;
    uint64_t tl_bitmap = 0;
    ucp_lane_index_t lane;
    ucs_status_t status;

    if (ep->flags & UCP_EP_FLAG_CONNECT_REQ_SENT) {
        return UCS_OK;
    }

    ucs_assert_always(!ucp_ep_is_stub(ep));

    for (lane = 0; lane < UCP_MAX_LANES; ++lane) {
        if (lane < ucp_ep_num_lanes(ep)) {
            rsc_index = ucp_ep_get_rsc_index(ep, lane);
            rsc_tli[lane] = ucp_worker_is_tl_p2p(worker, rsc_index) ? rsc_index :
                                                                      UCP_NULL_RESOURCE;
            tl_bitmap |= UCS_BIT(rsc_index);
        } else {
            rsc_tli[lane] = UCP_NULL_RESOURCE;
        }
    }

    /* TODO make sure such lane would exist */
    rsc_index = ucp_stub_ep_get_aux_rsc_index(
                    ep->uct_eps[ucp_ep_get_wireup_msg_lane(ep)]);
    if (rsc_index != UCP_NULL_RESOURCE) {
        tl_bitmap |= UCS_BIT(rsc_index);
    }

    ucs_debug("ep %p: send wireup request (flags=0x%x)", ep, ep->flags);
    status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, rsc_tli);
    ep->flags |= UCP_EP_FLAG_CONNECT_REQ_SENT;
    return status;
}
Beispiel #6
0
ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned address_count,
                                   const ucp_address_entry_t *address_list,
                                   uint8_t *addr_indices)
{
    ucp_worker_h worker = ep->worker;
    ucp_ep_config_key_t key;
    uint16_t new_cfg_index;
    ucp_lane_index_t lane;
    ucs_status_t status;
    uint8_t conn_flag;
    char str[32];

    ucs_trace("ep %p: initialize lanes", ep);

    status = ucp_wireup_select_lanes(ep, address_count, address_list,
                                     addr_indices, &key);
    if (status != UCS_OK) {
        goto err;
    }

    key.reachable_md_map |= ucp_ep_config(ep)->key.reachable_md_map;

    new_cfg_index = ucp_worker_get_ep_config(worker, &key);
    if ((ep->cfg_index == new_cfg_index)) {
        return UCS_OK; /* No change */
    }

    if ((ep->cfg_index != 0) && !ucp_ep_is_stub(ep)) {
        /*
         * TODO handle a case where we have to change lanes and reconfigure the ep:
         *
         * - if we already have uct ep connected to an address - move it to the new lane index
         * - if we don't yet have connection to an address - create it
         * - if an existing lane is not connected anymore - delete it (possibly)
         * - if the configuration has changed - replay all pending operations on all lanes -
         *   need that every pending callback would return, in case of failure, the number
         *   of lane it wants to be queued on.
         */
        ucs_debug("cannot reconfigure ep %p from [%d] to [%d]", ep, ep->cfg_index,
                  new_cfg_index);
        ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, "old", NULL);
        ucp_wireup_print_config(worker->context, &key, "new", NULL);
        ucs_fatal("endpoint reconfiguration not supported yet");
    }

    ep->cfg_index = new_cfg_index;
    ep->am_lane   = key.am_lane;

    snprintf(str, sizeof(str), "ep %p", ep);
    ucp_wireup_print_config(worker->context, &ucp_ep_config(ep)->key, str,
                            addr_indices);

    ucs_trace("ep %p: connect lanes", ep);

    /* establish connections on all underlying endpoints */
    conn_flag = UCP_EP_FLAG_LOCAL_CONNECTED;
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        status = ucp_wireup_connect_lane(ep, lane, address_count, address_list,
                                         addr_indices[lane]);
        if (status != UCS_OK) {
            goto err;
        }

        if (ucp_worker_is_tl_p2p(worker, ucp_ep_get_rsc_index(ep, lane))) {
            conn_flag = 0; /* If we have a p2p transport, we're not connected */
        }
    }

    ep->flags |= conn_flag;
    return UCS_OK;

err:
    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        if (ep->uct_eps[lane] != NULL) {
            uct_ep_destroy(ep->uct_eps[lane]);
            ep->uct_eps[lane] = NULL;
        }
    }
    return status;
}
Beispiel #7
0
static void ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg,
                                       uint64_t uuid, const char *peer_name,
                                       unsigned address_count,
                                       const ucp_address_entry_t *address_list)
{
    ucp_ep_h ep = ucp_worker_ep_find(worker, uuid);
    ucp_rsc_index_t rsc_tli[UCP_MAX_LANES];
    uint8_t addr_indices[UCP_MAX_LANES];
    ucp_lane_index_t lane, remote_lane;
    ucp_rsc_index_t rsc_index;
    ucs_status_t status;
    uint64_t tl_bitmap = 0;

    ucs_trace("ep %p: got wireup request from %s", ep, peer_name);

    /* Create a new endpoint if does not exist */
    if (ep == NULL) {
        status = ucp_ep_new(worker, uuid, peer_name, "remote-request", &ep);
        if (status != UCS_OK) {
            return;
        }
    }

    /* Initialize lanes (possible destroy existing lanes) */
    status = ucp_wireup_init_lanes(ep, address_count, address_list, addr_indices);
    if (status != UCS_OK) {
        return;
    }

    /* Connect p2p addresses to remote endpoint */
    if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
        status = ucp_wireup_connect_local(ep, addr_indices, address_count,
                                          address_list);
        if (status != UCS_OK) {
            return;
        }

        ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED;

        /* Construct the list that tells the remote side with which address we
         * have connected to each of its lanes.
         */
        memset(rsc_tli, -1, sizeof(rsc_tli));
        for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
             rsc_index = ucp_ep_get_rsc_index(ep, lane);
             for (remote_lane = 0; remote_lane < UCP_MAX_LANES; ++remote_lane) {
                 /* If 'lane' has connected to 'remote_lane' ... */
                 if (addr_indices[lane] == msg->tli[remote_lane]) {
                     ucs_assert(ucp_worker_is_tl_p2p(worker, rsc_index));
                     rsc_tli[remote_lane] = rsc_index;
                     tl_bitmap           |= UCS_BIT(rsc_index);
                 }
             }
        }

        ucs_trace("ep %p: sending wireup reply", ep);
        status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, rsc_tli);
        if (status != UCS_OK) {
            return;
        }
    }
}
Beispiel #8
0
void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
{
    ucp_context_h context = worker->context;
    ucp_ep_rma_config_t *rma_config;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t rsc_index;
    uct_md_attr_t *md_attr;
    ucp_lane_index_t lane;
    double zcopy_thresh, numerator, denumerator;
    size_t rndv_thresh;

    /* Default settings */
    config->zcopy_thresh          = SIZE_MAX;
    config->sync_zcopy_thresh     = -1;
    config->bcopy_thresh          = context->config.ext.bcopy_thresh;
    config->rndv_thresh           = SIZE_MAX;
    config->sync_rndv_thresh      = SIZE_MAX;
    config->max_rndv_get_zcopy    = SIZE_MAX;
    config->p2p_lanes             = 0;

    /* Collect p2p lanes */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        rsc_index   = config->key.lanes[lane];
        if ((rsc_index != UCP_NULL_RESOURCE) &&
            ucp_worker_is_tl_p2p(worker, rsc_index))
        {
            config->p2p_lanes |= UCS_BIT(lane);
        }
    }

    /* Configuration for active messages */
    if (config->key.am_lane != UCP_NULL_LANE) {
        lane        = config->key.am_lane;
        rsc_index   = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr  = &worker->iface_attrs[rsc_index];
            md_attr     = &context->md_attrs[context->tl_rscs[rsc_index].md_index];

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_SHORT) {
                config->max_eager_short  = iface_attr->cap.am.max_short -
                                           sizeof(ucp_eager_hdr_t);
                config->max_am_short     = iface_attr->cap.am.max_short -
                                           sizeof(uint64_t);
            }

            if (iface_attr->cap.flags & UCT_IFACE_FLAG_AM_BCOPY) {
                config->max_am_bcopy     = iface_attr->cap.am.max_bcopy;
            }

            if ((iface_attr->cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) &&
                (md_attr->cap.flags & UCT_MD_FLAG_REG))
            {
                config->max_am_zcopy  = iface_attr->cap.am.max_zcopy;

                if (context->config.ext.zcopy_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                    /* auto */
                    zcopy_thresh = md_attr->reg_cost.overhead / (
                                            (1.0 / context->config.ext.bcopy_bw) -
                                            (1.0 / iface_attr->bandwidth) -
                                            md_attr->reg_cost.growth);
                    if (zcopy_thresh < 0) {
                        config->zcopy_thresh      = SIZE_MAX;
                        config->sync_zcopy_thresh = -1;
                    } else {
                        config->zcopy_thresh      = zcopy_thresh;
                        config->sync_zcopy_thresh = zcopy_thresh;
                    }
                } else {
                    config->zcopy_thresh      = context->config.ext.zcopy_thresh;
                    config->sync_zcopy_thresh = context->config.ext.zcopy_thresh;
                }
            }
        } else {
            config->max_am_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for remote memory access */
    for (lane = 0; lane < config->key.num_lanes; ++lane) {
        if (ucp_lane_map_get_lane(config->key.rma_lane_map, lane) == 0) {
            continue;
        }

        rma_config = &config->rma[lane];
        rsc_index  = config->key.lanes[lane];
        iface_attr = &worker->iface_attrs[rsc_index];
        if (rsc_index != UCP_NULL_RESOURCE) {
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_SHORT) {
                rma_config->max_put_short = iface_attr->cap.put.max_short;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_BCOPY) {
                rma_config->max_put_bcopy = iface_attr->cap.put.max_bcopy;
            }
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_BCOPY) {
                rma_config->max_get_bcopy = iface_attr->cap.get.max_bcopy;
            }
        } else {
            rma_config->max_put_bcopy = UCP_MIN_BCOPY; /* Stub endpoint */
        }
    }

    /* Configuration for Rendezvous data */
    if (config->key.rndv_lane != UCP_NULL_LANE) {
        lane        = config->key.rndv_lane;
        rsc_index   = config->key.lanes[lane];
        if (rsc_index != UCP_NULL_RESOURCE) {
            iface_attr = &worker->iface_attrs[rsc_index];
            md_attr    = &context->md_attrs[context->tl_rscs[rsc_index].md_index];
            ucs_assert_always(iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY);

            if (context->config.ext.rndv_thresh == UCS_CONFIG_MEMUNITS_AUTO) {
                /* auto */
                /* Make UCX calculate the rndv threshold on its own.
                 * We do it by finding the message size at which rndv and eager_zcopy get
                 * the same latency. Starting this message size (rndv_thresh), rndv's latency
                 * would be lower and the reached bandwidth would be higher.
                 * The latency function for eager_zcopy is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth +
                 * max(size/bw , size/bcopy_bw) + overhead ]
                 * The latency function for Rendezvous is:
                 * [ reg_cost.overhead + size * md_attr->reg_cost.growth + latency + overhead +
                 *   reg_cost.overhead + size * md_attr->reg_cost.growth + overhead + latency +
                 *   size/bw + latency + overhead + latency ]
                 * Isolating the 'size' yields the rndv_thresh.
                 * The used latency functions for eager_zcopy and rndv are also specified in
                 * the UCX wiki */
                numerator = ((2 * iface_attr->overhead) + (4 * iface_attr->latency) +
                             md_attr->reg_cost.overhead);
                denumerator = (ucs_max((1.0 / iface_attr->bandwidth),(1.0 / context->config.ext.bcopy_bw)) -
                               md_attr->reg_cost.growth - (1.0 / iface_attr->bandwidth));

                if (denumerator > 0) {
                    rndv_thresh = numerator / denumerator;
                    ucs_trace("rendezvous threshold is %zu ( = %f / %f)",
                              rndv_thresh, numerator, denumerator);
                } else {
                    rndv_thresh = context->config.ext.rndv_thresh_fallback;
                    ucs_trace("rendezvous threshold is %zu", rndv_thresh);
                }

                /* for the 'auto' mode in the rndv_threshold, we enforce the usage of rndv
                 * to a value that can be set by the user.
                 * to disable rndv, need to set a high value for the rndv_threshold
                 * (without the 'auto' mode)*/
                config->rndv_thresh        = rndv_thresh;
                config->sync_rndv_thresh   = rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;

            } else {
                config->rndv_thresh        = context->config.ext.rndv_thresh;
                config->sync_rndv_thresh   = context->config.ext.rndv_thresh;
                config->max_rndv_get_zcopy = iface_attr->cap.get.max_zcopy;
                ucs_trace("rendezvous threshold is %zu", config->rndv_thresh);
            }
        } else {
            ucs_debug("rendezvous protocol is not supported ");
        }
    }
}