Example #1
0
static ucs_status_t ucp_fill_resources(ucp_context_h context,
                                       const ucp_config_t *config)
{
    unsigned num_tl_resources;
    unsigned num_md_resources;
    uct_md_resource_desc_t *md_rscs;
    ucs_status_t status;
    ucp_rsc_index_t i;
    unsigned md_index;
    uct_md_h md;
    uct_md_config_t *md_config;
    uint64_t masks[UCT_DEVICE_TYPE_LAST] = {0};

    /* if we got here then num_resources > 0.
     * if the user's device list is empty, there is no match */
    if ((0 == config->devices[UCT_DEVICE_TYPE_NET].count) &&
        (0 == config->devices[UCT_DEVICE_TYPE_SHM].count) &&
        (0 == config->devices[UCT_DEVICE_TYPE_ACC].count) &&
        (0 == config->devices[UCT_DEVICE_TYPE_SELF].count)) {
        ucs_error("The device lists are empty. Please specify the devices you would like to use "
                  "or omit the UCX_*_DEVICES so that the default will be used.");
        status = UCS_ERR_NO_ELEM;
        goto err;
    }

    /* if we got here then num_resources > 0.
     * if the user's tls list is empty, there is no match */
    if (0 == config->tls.count) {
        ucs_error("The TLs list is empty. Please specify the transports you would like to use "
                  "or omit the UCX_TLS so that the default will be used.");
        status = UCS_ERR_NO_ELEM;
        goto err;
    }

    /* List memory domain resources */
    status = uct_query_md_resources(&md_rscs, &num_md_resources);
    if (status != UCS_OK) {
        goto err;
    }

    /* Sort md's by name, to increase the likelihood of reusing the same ep
     * configuration (since remote md map is part of the key).
     */
    qsort(md_rscs, num_md_resources, sizeof(*md_rscs), ucp_md_rsc_compare_name);

    /* Error check: Make sure there is at least one MD */
    if (num_md_resources == 0) {
        ucs_error("No md resources found");
        status = UCS_ERR_NO_DEVICE;
        goto err_release_md_resources;
    }

    context->num_mds  = 0;
    context->md_rscs  = NULL;
    context->mds      = NULL;
    context->md_attrs = NULL;
    context->num_tls  = 0;
    context->tl_rscs  = NULL;

    /* Allocate array of MD resources we would actually use */
    context->md_rscs = ucs_calloc(num_md_resources, sizeof(*context->md_rscs),
                                  "ucp_md_resources");
    if (context->md_rscs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Allocate array of memory domains */
    context->mds = ucs_calloc(num_md_resources, sizeof(*context->mds), "ucp_mds");
    if (context->mds == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Allocate array of memory domains attributes */
    context->md_attrs = ucs_calloc(num_md_resources, sizeof(*context->md_attrs),
                                   "ucp_md_attrs");
    if (context->md_attrs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Open all memory domains, keep only those which have at least one TL
     * resources selected on them.
     */
    md_index = 0;
    for (i = 0; i < num_md_resources; ++i) {
        status = uct_md_config_read(md_rscs[i].md_name, NULL, NULL, &md_config);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        status = uct_md_open(md_rscs[i].md_name, md_config, &md);
        uct_config_release(md_config);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        context->md_rscs[md_index] = md_rscs[i];
        context->mds[md_index]     = md;

        /* Save MD attributes */
        status = uct_md_query(md, &context->md_attrs[md_index]);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        /* Add communication resources of each MD */
        status = ucp_add_tl_resources(context, md, md_index, config,
                                      &num_tl_resources, masks);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        /* If the MD does not have transport resources, don't use it */
        if (num_tl_resources > 0) {
            ++md_index;
            ++context->num_mds;
        } else {
            ucs_debug("closing md %s because it has no selected transport resources",
                      md_rscs[i].md_name);
            uct_md_close(md);
        }
    }

    /* Error check: Make sure there is at least one transport */
    if (0 == context->num_tls) {
        ucs_error("There are no available resources matching the configured criteria");
        status = UCS_ERR_NO_DEVICE;
        goto err_free_context_resources;
    }

    if (context->num_mds > UCP_MD_INDEX_BITS) {
        ucs_error("Only up to %d memory domains are supported (have: %d)",
                  UCP_MD_INDEX_BITS, context->num_mds);
        status = UCS_ERR_EXCEEDS_LIMIT;
        goto err_release_md_resources;
    }

    /* Notify the user if there are devices from the command line that are not available */
    ucp_check_unavailable_devices(config->devices, masks);

    /* Error check: Make sure there are not too many transports */
    if (context->num_tls >= UCP_MAX_RESOURCES) {
        ucs_error("Exceeded resources limit (%u requested, up to %d are supported)",
                  context->num_tls, UCP_MAX_RESOURCES);
        status = UCS_ERR_EXCEEDS_LIMIT;
        goto err_free_context_resources;
    }

    status = ucp_check_tl_names(context);
    if (status != UCS_OK) {
        goto err_free_context_resources;
    }

    uct_release_md_resource_list(md_rscs);
    return UCS_OK;

err_free_context_resources:
    ucp_free_resources(context);
err_release_md_resources:
    uct_release_md_resource_list(md_rscs);
err:
    return status;
}
Example #2
0
static ucs_status_t ucp_fill_resources(ucp_context_h context,
                                       const ucp_config_t *config)
{
    unsigned num_tl_resources;
    unsigned num_pd_resources;
    uct_pd_resource_desc_t *pd_rscs;
    ucs_status_t status;
    ucp_rsc_index_t i;
    unsigned pd_index;
    uct_pd_h pd;
    uct_pd_config_t *pd_config;
    uint64_t masks[UCT_DEVICE_TYPE_LAST] = {0};

    /* if we got here then num_resources > 0.
     * if the user's device list is empty, there is no match */
    if ((0 == config->devices[UCT_DEVICE_TYPE_NET].count) &&
        (0 == config->devices[UCT_DEVICE_TYPE_SHM].count) &&
        (0 == config->devices[UCT_DEVICE_TYPE_ACC].count)) {
        ucs_error("The device lists are empty. Please specify the devices you would like to use "
                  "or omit the UCX_*_DEVICES so that the default will be used.");
        status = UCS_ERR_NO_ELEM;
        goto err;
    }

    /* if we got here then num_resources > 0.
     * if the user's tls list is empty, there is no match */
    if (0 == config->tls.count) {
        ucs_error("The TLs list is empty. Please specify the transports you would like to use "
                  "or omit the UCX_TLS so that the default will be used.");
        status = UCS_ERR_NO_ELEM;
        goto err;
    }

    /* List protection domain resources */
    status = uct_query_pd_resources(&pd_rscs, &num_pd_resources);
    if (status != UCS_OK) {
        goto err;
    }

    /* Error check: Make sure there is at least one PD */
    if (num_pd_resources == 0) {
        ucs_error("No pd resources found");
        status = UCS_ERR_NO_DEVICE;
        goto err_release_pd_resources;
    }

    if (num_pd_resources >= UCP_MAX_PDS) {
        ucs_error("Only up to %ld PDs are supported", UCP_MAX_PDS);
        status = UCS_ERR_EXCEEDS_LIMIT;
        goto err_release_pd_resources;
    }

    context->num_pds  = 0;
    context->pd_rscs  = NULL;
    context->pds      = NULL;
    context->pd_attrs = NULL;
    context->num_tls  = 0;
    context->tl_rscs  = NULL;

    /* Allocate array of PD resources we would actually use */
    context->pd_rscs = ucs_calloc(num_pd_resources, sizeof(*context->pd_rscs),
                                  "ucp_pd_resources");
    if (context->pd_rscs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Allocate array of protection domains */
    context->pds = ucs_calloc(num_pd_resources, sizeof(*context->pds), "ucp_pds");
    if (context->pds == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Allocate array of protection domains attributes */
    context->pd_attrs = ucs_calloc(num_pd_resources, sizeof(*context->pd_attrs),
                                   "ucp_pd_attrs");
    if (context->pd_attrs == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto err_free_context_resources;
    }

    /* Open all protection domains, keep only those which have at least one TL
     * resources selected on them.
     */
    pd_index = 0;
    for (i = 0; i < num_pd_resources; ++i) {
        status = uct_pd_config_read(pd_rscs[i].pd_name, NULL, NULL, &pd_config);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        status = uct_pd_open(pd_rscs[i].pd_name, pd_config, &pd);
        uct_config_release(pd_config);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        context->pd_rscs[pd_index] = pd_rscs[i];
        context->pds[pd_index]     = pd;

        /* Save PD attributes */
        status = uct_pd_query(pd, &context->pd_attrs[pd_index]);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        /* Add communication resources of each PD */
        status = ucp_add_tl_resources(context, pd, pd_index, config,
                                      &num_tl_resources, masks);
        if (status != UCS_OK) {
            goto err_free_context_resources;
        }

        /* If the PD does not have transport resources, don't use it */
        if (num_tl_resources > 0) {
            ++pd_index;
            ++context->num_pds;
        } else {
            ucs_debug("closing pd %s because it has no selected transport resources",
                      pd_rscs[i].pd_name);
            uct_pd_close(pd);
        }
    }

    /* Error check: Make sure there is at least one transport */
    if (0 == context->num_tls) {
        ucs_error("There are no available resources matching the configured criteria");
        status = UCS_ERR_NO_DEVICE;
        goto err_free_context_resources;
    }

    /* Notify the user if there are devices from the command line that are not available */
    ucp_check_unavailable_devices(config->devices, masks);

    /* Error check: Make sure there are not too many transports */
    if (context->num_tls >= UCP_MAX_RESOURCES) {
        ucs_error("Exceeded resources limit (%u requested, up to %d are supported)",
                  context->num_tls, UCP_MAX_RESOURCES);
        status = UCS_ERR_EXCEEDS_LIMIT;
        goto err_free_context_resources;
    }

    uct_release_pd_resource_list(pd_rscs);
    return UCS_OK;

err_free_context_resources:
    ucp_free_resources(context);
err_release_pd_resources:
    uct_release_pd_resource_list(pd_rscs);
err:
    return status;
}