Example 1
/***   MODEX SECTION ***/
static int modex(orte_grpcomm_collective_t *coll)
{
    int *local_ranks = NULL, local_rank_count = 0;
    opal_hwloc_locality_t locality;
    const char *cpuset;
    orte_process_name_t name;
    orte_vpid_t v;
    bool local;
    int rc, i;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* discover the local ranks */
#if WANT_PMI2_SUPPORT
    {
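        /* PMI2 publishes the job layout as the PMI_process_mapping job
         * attribute - a string such as "(vector,(0,4,8))" in which each
         * (base,nodes,ppn) triple describes a block of nodes and the number
         * of consecutive vpids placed on each of them.  Parse it below to
         * find the block of vpids that share our node. */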
        char *pmapping = (char*)malloc(PMI2_MAX_VALLEN);
        int found, sid, nodes, k;
        orte_vpid_t n;
        char *p;
        rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found);
        if (!found || PMI_SUCCESS != rc) { /* can't check PMI2_SUCCESS as some folks (i.e., Cray) don't define it */
            opal_output(0, "%s could not get PMI_process_mapping (PMI2_Info_GetJobAttr() failed)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }

        i = 0; n = 0; local_rank_count = 0;
        if (NULL != (p = strstr(pmapping, "(vector"))) {
            while (NULL != (p = strstr(p+1, ",("))) {
                if (3 == sscanf(p, ",(%d,%d,%d)", &sid, &nodes, &local_rank_count)) {
                    for (k = 0; k < nodes; k++) {
                        if ((ORTE_PROC_MY_NAME->vpid >= n) &&
                            (ORTE_PROC_MY_NAME->vpid < (n + local_rank_count))) {
                            break;
                        }
                        n += local_rank_count;
                    }
                }
            }
        }
        free(pmapping);

        if (local_rank_count > 0) {
            local_ranks = (int*)malloc(local_rank_count * sizeof(int));
            for (i=0; i < local_rank_count; i++) {
                local_ranks[i] = n + i;
            }
        }

        if (NULL == local_ranks) {
            opal_output(0, "%s could not get PMI_process_mapping",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }
    }
#else
    rc = PMI_Get_clique_size (&local_rank_count);
    if (PMI_SUCCESS != rc) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    local_ranks = calloc (local_rank_count, sizeof (int));
    if (NULL == local_ranks) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    rc = PMI_Get_clique_ranks (local_ranks, local_rank_count);
    if (PMI_SUCCESS != rc) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }
#endif


    /* our RTE data was constructed and pushed in the ESS pmi component */

    /* commit our modex info */
    opal_db.commit((opal_identifier_t *)ORTE_PROC_MY_NAME);

    /* cycle thru all my peers and collect their RTE info */
    name.jobid = ORTE_PROC_MY_NAME->jobid;
    for (v=0; v < orte_process_info.num_procs; v++) {
        if (v == ORTE_PROC_MY_NAME->vpid) {
            continue;
        }
        name.vpid = v;

        /* check if this is a local process */
        for (i = 0, local = false; i < local_rank_count; ++i) {
            if ((orte_vpid_t) local_ranks[i] == v) {
                local = true;
                break;
            }
        }

        /* compute and store the locality as it isn't something that gets
         * pushed to PMI - doing this here prevents the MPI layer from
         * fetching data for all ranks
         */
        if (local) {
            if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&name, OPAL_DB_CPUSET,
                                                            (void **)&cpuset, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }

            if (NULL == cpuset) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.cpuset,
                                                                 (char *) cpuset);
            }
        } else {
            /* this proc is on a different node, so mark it as non-local */
            locality = OPAL_PROC_NON_LOCAL;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                            "%s grpcomm:pmi proc %s locality %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name), opal_hwloc_base_print_locality(locality)));

        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_SCOPE_INTERNAL,
                                                OPAL_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* done with the local rank list */
    free(local_ranks);

    /* execute the callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }
    return ORTE_SUCCESS;
}
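Note: the PMI2 branch above derives the set of node-local vpids purely from the PMI_process_mapping string. Below is a minimal standalone sketch of that parsing logic; the mapping string and vpid are made up for illustration (in the real code the string comes from PMI2_Info_GetJobAttr() and the vpid from ORTE_PROC_MY_NAME).

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* hypothetical mapping: two blocks of 2 nodes, 4 ranks per node */
    const char *pmapping = "(vector,(0,2,4),(2,2,4))";
    unsigned int my_vpid = 6;   /* stand-in for ORTE_PROC_MY_NAME->vpid */
    int sid, nodes, ppn, k, found = 0;
    unsigned int n = 0, base = 0;
    int local_rank_count = 0;
    const char *p;

    if (NULL != (p = strstr(pmapping, "(vector"))) {
        while (!found && NULL != (p = strstr(p + 1, ",("))) {
            if (3 == sscanf(p, ",(%d,%d,%d)", &sid, &nodes, &ppn)) {
                for (k = 0; k < nodes; k++) {
                    if (my_vpid >= n && my_vpid < n + (unsigned int)ppn) {
                        base = n;                 /* first vpid on our node */
                        local_rank_count = ppn;   /* how many share it */
                        found = 1;
                        break;
                    }
                    n += ppn;
                }
            }
        }
    }

    (void)sid;  /* starting node id of the block - not needed here */
    if (found) {
        printf("vpid %u shares its node with vpids [%u..%u)\n",
               my_vpid, base, base + (unsigned int)local_rank_count);
    } else {
        printf("vpid %u not found in the mapping\n", my_vpid);
    }
    return 0;
}

Compiled on its own, this prints "vpid 6 shares its node with vpids [4..8)".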
Example 2
static int cray_fence(opal_process_name_t *procs, size_t nprocs)
{
    int rc;
    int32_t i;
    opal_value_t *kp, kvn;
    opal_hwloc_locality_t locality;

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray called fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* check if there is partially filled meta key and put them */
    if (0 != pmix_packed_data_offset && NULL != pmix_packed_data) {
        opal_pmix_base_commit_packed(pmix_packed_data, pmix_packed_data_offset, pmix_vallen_max, &pmix_pack_key, kvs_put);
        pmix_packed_data_offset = 0;
        free(pmix_packed_data);
        pmix_packed_data = NULL;
    }

    if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
        OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence");
        return OPAL_ERROR;
    }

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    if (!pmix_got_modex_data) {
        pmix_got_modex_data = true;
        /* we only need to set locality for each local rank as "not found"
         * equates to "non-local" */
        for (i=0; i < pmix_nlranks; i++) {
            pmix_pname.vid = pmix_lranks[i];
            rc = opal_pmix_base_cache_keys_locally((opal_identifier_t*)&pmix_pname, OPAL_DSTORE_CPUSET,
                                                   &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
            if (OPAL_SUCCESS != rc) {
                OPAL_ERROR_LOG(rc);
                return rc;
            }
#if OPAL_HAVE_HWLOC
            if (NULL == kp || NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 opal_process_info.cpuset,
                                                                 kp->data.string);
            }
            if (NULL != kp) {
                OBJ_RELEASE(kp);
            }
#else
            /* all we know is we share a node */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:s2 proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                 OPAL_NAME_PRINT(*(opal_identifier_t*)&pmix_pname),
                                 opal_hwloc_base_print_locality(locality)));

            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_DSTORE_LOCALITY);
            kvn.type = OPAL_UINT16;
            kvn.data.uint16 = locality;
            (void)opal_dstore.store(opal_dstore_internal, (opal_identifier_t*)&pmix_pname, &kvn);
            OBJ_DESTRUCT(&kvn);
        }
    }

    return OPAL_SUCCESS;
}
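Note on the locality value computed above: it is a bit mask. Knowing only that two processes share a node yields OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE, while opal_hwloc_base_get_relative_locality() can add finer-grained bits when both cpusets are known. A minimal sketch of how such a mask composes and is queried, using made-up flag values rather than the real OPAL definitions:

#include <stdio.h>
#include <stdint.h>

/* illustrative values only - the real flags come from OPAL's hwloc framework */
#define EX_PROC_ON_CLUSTER 0x0001
#define EX_PROC_ON_CU      0x0002
#define EX_PROC_ON_NODE    0x0004
#define EX_PROC_ON_SOCKET  0x0008
#define EX_PROC_ON_CORE    0x0010

int main(void)
{
    /* peer shares our node and socket, but not our core */
    uint16_t locality = EX_PROC_ON_CLUSTER | EX_PROC_ON_CU |
                        EX_PROC_ON_NODE | EX_PROC_ON_SOCKET;

    printf("same node:   %s\n", (locality & EX_PROC_ON_NODE)   ? "yes" : "no");
    printf("same socket: %s\n", (locality & EX_PROC_ON_SOCKET) ? "yes" : "no");
    printf("same core:   %s\n", (locality & EX_PROC_ON_CORE)   ? "yes" : "no");
    return 0;
}

The mask fits in a uint16_t, which matches the OPAL_UINT16 type used when the locality key is stored above.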
Example 3
static int s1_fence(opal_list_t *procs, int collect_data)
{
    int rc;
    int32_t i;
    opal_value_t *kp, kvn;
    opal_hwloc_locality_t locality;
    opal_process_name_t s1_pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:s1 called fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* use the PMI barrier function */
    if (PMI_SUCCESS != (rc = PMI_Barrier())) {
        OPAL_PMI_ERROR(rc, "PMI_Barrier");
        return OPAL_ERROR;
    }

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:s1 barrier complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    s1_pname.jobid = OPAL_PROC_MY_NAME.jobid;
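    /* all local peers share our jobid - only the vpid changes as we walk
     * the local rank list below */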
    if (!got_modex_data) {
        got_modex_data = true;
        /* we only need to set locality for each local rank as "not found"
         * equates to "non-local" */
        for (i=0; i < nlranks; i++) {
            s1_pname.vpid = lranks[i];
            rc = opal_pmix_base_cache_keys_locally(&s1_pname, OPAL_PMIX_CPUSET,
                                                   &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
            if (OPAL_SUCCESS != rc) {
                OPAL_ERROR_LOG(rc);
                return rc;
            }
            if (NULL == kp || NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 opal_process_info.cpuset,
                                                                 kp->data.string);
            }
            if (NULL != kp) {
                OBJ_RELEASE(kp);
            }
            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:s1 proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                 OPAL_NAME_PRINT(s1_pname),
                                 opal_hwloc_base_print_locality(locality)));

            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_PMIX_LOCALITY);
            kvn.type = OPAL_UINT16;
            kvn.data.uint16 = locality;
            opal_pmix_base_store(&s1_pname, &kvn);
            OBJ_DESTRUCT(&kvn);
        }
    }

    return OPAL_SUCCESS;
}
Example 4
static int cray_fence(opal_list_t *procs, int collect_data)
{
    int rc, cnt;
    int32_t i;
    int *all_lens = NULL;
    opal_value_t *kp, kvn;
    opal_buffer_t *send_buffer = NULL;
    opal_buffer_t *buf = NULL;
    void *sbuf_ptr;
    char *cptr, *rcv_buff = NULL;
    opal_process_name_t id;
    typedef struct {
        uint32_t pmix_rank;
        opal_process_name_t name;
        int32_t nbytes;
    } bytes_and_rank_t;
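    /* one of these is contributed by every rank in the fixed-size allgather
     * below, so we learn each peer's name and the size of its modex blob
     * before exchanging the variable-length payloads */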
    int32_t rcv_nbytes_tot;
    bytes_and_rank_t s_bytes_and_rank;
    bytes_and_rank_t *r_bytes_and_ranks = NULL;
    opal_hwloc_locality_t locality;
    opal_list_t vals;
    char *cpuset = NULL;
    opal_process_name_t pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray executing fence cache_global %p cache_local %p",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        (void *)mca_pmix_cray_component.cache_global,
                        (void *)mca_pmix_cray_component.cache_local);

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    pname.jobid = OPAL_PROC_MY_NAME.jobid;

    /*
     * "unload" the cache_local/cache_global buffers, first copy
     * it so we can continue to use the local buffers if further
     * calls to put can be made
     */

    send_buffer = OBJ_NEW(opal_buffer_t);
    if (NULL == send_buffer) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
    opal_dss.unload(send_buffer, &sbuf_ptr, &s_bytes_and_rank.nbytes);
    s_bytes_and_rank.pmix_rank = pmix_rank;
    s_bytes_and_rank.name = OPAL_PROC_MY_NAME;

    r_bytes_and_ranks = (bytes_and_rank_t *)malloc(pmix_size * sizeof(bytes_and_rank_t));
    if (NULL == r_bytes_and_ranks) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    /*
     * gather up all the buffer sizes and rank order.
     * doing this step below since the cray pmi PMI_Allgather doesn't deliver
     * the gathered data necessarily in PMI rank order, although the order stays
     * the same for the duration of a job - assuming no node failures.
     */

    if (PMI_SUCCESS != (rc = PMI_Allgather(&s_bytes_and_rank,r_bytes_and_ranks,sizeof(bytes_and_rank_t)))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgather");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }


    for (rcv_nbytes_tot=0,i=0; i < pmix_size; i++) {
        rcv_nbytes_tot += r_bytes_and_ranks[i].nbytes;
    }

    opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray total number of bytes to receive %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), rcv_nbytes_tot);

    rcv_buff = (char *) malloc(rcv_nbytes_tot * sizeof(char));
    if (NULL == rcv_buff) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    all_lens = (int *)malloc(sizeof(int) * pmix_size);
    if (NULL == all_lens) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }
    for (i=0; i< pmix_size; i++) {
        all_lens[r_bytes_and_ranks[i].pmix_rank] = r_bytes_and_ranks[i].nbytes;
    }

    if (PMI_SUCCESS != (rc = PMI_Allgatherv(sbuf_ptr,s_bytes_and_rank.nbytes,rcv_buff,all_lens))) {
        OPAL_PMI_ERROR(rc,"PMI_Allgatherv");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    OBJ_RELEASE(send_buffer);
    send_buffer  = NULL;

    buf = OBJ_NEW(opal_buffer_t);
    if (buf == NULL) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    for (cptr = rcv_buff, i=0; i < pmix_size; i++) {

        id = r_bytes_and_ranks[i].name;

        buf->base_ptr = NULL;  /* TODO: ugh */
        if (OPAL_SUCCESS != (rc = opal_dss.load(buf, (void *)cptr, r_bytes_and_ranks[i].nbytes))) {
            OPAL_PMI_ERROR(rc,"pmix:cray opal_dss.load failed");
            goto fn_exit;
        }

        /* unpack and stuff in to the dstore */

        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray unpacked kp with key %s type(%d) for id  %s",
                         OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key, kp->type, OPAL_NAME_PRINT(id));
            if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&id, kp))) {
                OPAL_ERROR_LOG(rc);
                goto fn_exit;
            }
            OBJ_RELEASE(kp);
            cnt = 1;
        }
        /* the unpack loop should only end by running out of data */
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            OPAL_ERROR_LOG(rc);
            goto fn_exit;
        }

        cptr += r_bytes_and_ranks[i].nbytes;

    }

    buf->base_ptr = NULL;  /* TODO: ugh */
    OBJ_RELEASE(buf);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_pmix_base_fetch(&pmix_pname,
                                                   OPAL_PMIX_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        /* guard against a stored but empty cpuset */
        cpuset = (NULL == kp->data.string) ? NULL : strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non-local" */
    for (i=0; i < pmix_nlranks; i++) {
        id.vpid = pmix_lranks[i];
        id.jobid = pmix_jobid;
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s checking out if %s is local to me",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
        /* fetch cpuset for this vpid */
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_pmix_base_fetch(&id,
                                                    OPAL_PMIX_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:cray proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_PMIX_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        opal_pmix_base_store(&id, &kvn);
        OBJ_DESTRUCT(&kvn);
    }

    rc = OPAL_SUCCESS;

fn_exit:
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    if (all_lens != NULL) {
        free(all_lens);
    }
    if (rcv_buff != NULL) {
        free(rcv_buff);
    }
    if (r_bytes_and_ranks != NULL) {
        free(r_bytes_and_ranks);
    }
    return rc;
}
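The exchange above happens in two phases: a fixed-size PMI_Allgather of (rank, name, length) headers, then a PMI_Allgatherv of the variable-length modex blobs, after which the concatenated receive buffer is walked using the gathered lengths. A small PMI-independent sketch of how those per-rank lengths turn into offsets into that buffer (the lengths here are made up):

#include <stdio.h>

int main(void)
{
    /* hypothetical blob sizes reported by 4 ranks in the first gather */
    int lens[] = { 120, 64, 200, 96 };
    int nranks = (int)(sizeof(lens) / sizeof(lens[0]));
    int offsets[4];
    int total = 0, i;

    /* prefix sum: rank i's blob occupies [offsets[i], offsets[i] + lens[i]) */
    for (i = 0; i < nranks; i++) {
        offsets[i] = total;
        total += lens[i];
    }

    printf("total bytes to receive: %d\n", total);
    for (i = 0; i < nranks; i++) {
        printf("rank %d: bytes [%d..%d)\n", i, offsets[i], offsets[i] + lens[i]);
    }
    return 0;
}

In the real code the headers can arrive out of PMI-rank order, which is why all_lens[] is filled by indexing on the reported pmix_rank rather than on arrival position.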
Example 5
/*
 * Function for mapping a job - cycles through the available mapper
 * modules until one agrees to map it.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;
    jdata->state = ORTE_JOB_STATE_MAP;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */        
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));

        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) {
        /* forced selection */
        mod = (orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules);
        jdata->map->req_mapper = strdup(mod->component->mca_component_name);
    }
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata)) ||
            ORTE_ERR_RESOURCE_BUSY == rc) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) {
        /* the map was done but nothing could be mapped
         * for launch as all the resources were busy
         */
        OBJ_RELEASE(caddy);
        return;
    }

    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    
#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif
    
    /* set the offset so shared memory components can potentially
     * connect to any spawned jobs
     */
    jdata->offset = orte_total_procs;
    /* track the total number of procs launched by us */
    orte_total_procs += jdata->num_procs;

    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output=NULL;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>\n\t<jobid=%s>\n\t<offset=%s>",
                        ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset));
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        char locale[64];

                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, sizeof(locale), proc->locale->cpuset);
                        } else {
                            /* don't print an uninitialized buffer */
                            strncpy(locale, "UNKNOWN", sizeof(locale));
                        }
                        opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s>",
                                    ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap);
                    }
#else
                    opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     p0->cpu_bitmap,
                                                                     proc->cpu_bitmap);
                    opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>",
                                ORTE_VPID_PRINT(p0->name.vpid),
                                ORTE_VPID_PRINT(proc->name.vpid),
                                opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset));
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }
    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
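The mapper loop above is a simple try-each-module chain: the first module returning success (or "resource busy") wins, ORTE_ERR_TAKE_NEXT_OPTION means "I declined, ask the next one", and any other return aborts the job. A condensed sketch of that control flow with hypothetical stand-in modules (not the real ORTE rmaps API):

#include <stdio.h>
#include <stdbool.h>

/* illustrative return codes standing in for ORTE_SUCCESS and friends */
enum { EX_SUCCESS, EX_RESOURCE_BUSY, EX_TAKE_NEXT_OPTION, EX_ERROR };

typedef struct {
    const char *name;
    int (*map_job)(void);
} ex_mapper_t;

static int ppr_decline(void) { return EX_TAKE_NEXT_OPTION; }
static int rr_map(void)      { return EX_SUCCESS; }

int main(void)
{
    ex_mapper_t mappers[] = { { "ppr", ppr_decline }, { "round_robin", rr_map } };
    int nmappers = (int)(sizeof(mappers) / sizeof(mappers[0]));
    bool did_map = false;
    int rc, i;

    for (i = 0; i < nmappers; i++) {
        rc = mappers[i].map_job();
        if (EX_SUCCESS == rc || EX_RESOURCE_BUSY == rc) {
            did_map = true;
            printf("job mapped by the %s mapper\n", mappers[i].name);
            break;
        }
        if (EX_TAKE_NEXT_OPTION != rc) {
            printf("mapper %s failed hard - abort the launch\n", mappers[i].name);
            return 1;
        }
        /* EX_TAKE_NEXT_OPTION: this mapper declined, try the next one */
    }

    if (!did_map) {
        printf("no mapper was able to map the job\n");
        return 1;
    }
    return 0;
}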
Example 6
static bool native_get_attr(const char *attr, opal_value_t **kv)
{
    opal_buffer_t *msg, *bptr;
    opal_list_t vals;
    opal_value_t *kp, *lclpeers=NULL, kvn;
    pmix_cmd_t cmd = PMIX_GETATTR_CMD;
    char **ranks;
    int rc, ret;
    int32_t cnt;
    bool found=false;
    opal_hwloc_locality_t locality;
    pmix_cb_t *cb;
    uint32_t i, myrank;
    opal_process_name_t id;
    char *cpuset;
    opal_buffer_t buf, buf2;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:native get_attr called",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* try to retrieve the requested value from the dstore */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, attr, &vals)) {
        *kv = (opal_value_t*)opal_list_remove_first(&vals);
        OPAL_LIST_DESTRUCT(&vals);
        return true;
    }

    if (NULL == mca_pmix_native_component.uri) {
        /* no server available, so just return */
        return false;
    }

    /* if the value isn't yet available, then we should try to retrieve
     * all the available attributes and store them for future use */
    msg = OBJ_NEW(opal_buffer_t);
    /* pack the cmd */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &cmd, 1, PMIX_CMD_T))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return false;
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = OBJ_NEW(pmix_cb_t);
    cb->active = true;

    /* push the message into our event base to send to the server */
    PMIX_ACTIVATE_SEND_RECV(msg, wait_cbfunc, cb);

    /* wait for the data to return */
    PMIX_WAIT_FOR_COMPLETION(cb->active);

    /* we have received the entire data blob for this process - unpack
     * and cache all values, keeping the one we requested to return
     * to the caller */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &ret, &cnt, OPAL_INT))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(cb);
        return false;
    }
    if (OPAL_SUCCESS == ret) {
        /* unpack the buffer containing the values */
        cnt = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
            OPAL_ERROR_LOG(rc);
            OBJ_RELEASE(cb);
            return false;
        }
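        /* the buffer holds a sequence of opal_value_t entries; most are
         * cached as-is, but a few keys (local topology, local cpusets,
         * proc map) carry nested byte objects that are unpacked below */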
        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s unpacked attr %s",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key);
            /* if this is the local topology, we need to save it in a special way */
#if OPAL_HAVE_HWLOC
            {
                hwloc_topology_t topo;
                if (0 == strcmp(PMIX_LOCAL_TOPO, kp->key)) {
                    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                        "%s saving topology",
                                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                    /* transfer the byte object for unpacking */
                    OBJ_CONSTRUCT(&buf, opal_buffer_t);
                    opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                    kp->data.bo.bytes = NULL;  // protect the data region
                    kp->data.bo.size = 0;
                    OBJ_RELEASE(kp);
                    /* extract the topology */
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &topo, &cnt, OPAL_HWLOC_TOPO))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        continue;
                    }
                    OBJ_DESTRUCT(&buf);
                    if (NULL == opal_hwloc_topology) {
                        opal_hwloc_topology = topo;
                    } else {
                        hwloc_topology_destroy(topo);
                    }
                    cnt = 1;
                    continue;
                }
            }
#endif
            /* if this is the local cpuset blob, then unpack and store its contents */
            if (0 == strcmp(PMIX_LOCAL_CPUSETS, kp->key)) {
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s received local cpusets",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                /* transfer the byte object for unpacking */
                OBJ_CONSTRUCT(&buf, opal_buffer_t);
                opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                kp->data.bo.bytes = NULL;  // protect the data region
                kp->data.bo.size = 0;
                OBJ_RELEASE(kp);
                cnt=1;
                while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &id, &cnt, OPAL_NAME))) {
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &cpuset, &cnt, OPAL_STRING))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        cnt = 1;
                        continue;
                    }
                    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                        "%s saving cpuset %s for local peer %s",
                                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                        (NULL == cpuset) ? "NULL" : cpuset,
                                        OPAL_NAME_PRINT(id));
                    OBJ_CONSTRUCT(&kvn, opal_value_t);
                    kvn.key = strdup(OPAL_DSTORE_CPUSET);
                    kvn.type = OPAL_STRING;
                    kvn.data.string = cpuset;
                    if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, &kvn))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&kvn);
                        cnt = 1;
                        continue;
                    }
                    OBJ_DESTRUCT(&kvn);
                }
                OBJ_DESTRUCT(&buf);
                if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                    OPAL_ERROR_LOG(rc);
                    return false;
                }
                cnt=1;
                continue;
            } else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) {
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s received proc map",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                /* transfer the byte object for unpacking */
                OBJ_CONSTRUCT(&buf, opal_buffer_t);
                opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                kp->data.bo.bytes = NULL;  // protect the data region
                kp->data.bo.size = 0;
                OBJ_RELEASE(kp);
                /* get the jobid */
                cnt=1;
                if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                    OPAL_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&buf);
                    cnt = 1;
                    return false;
                }
                if (0 != strcmp(PMIX_JOBID, kp->key)) {
                    OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                    OBJ_DESTRUCT(&buf);
                    OBJ_RELEASE(kp);
                    cnt = 1;
                    return false;
                }
                id.jobid = kp->data.uint32;
                OBJ_RELEASE(kp);
                /* unpack the data for each rank */
                cnt=1;
                while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                    if (0 != strcmp(PMIX_RANK, kp->key)) {
                        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                        OBJ_DESTRUCT(&buf);
                        OBJ_RELEASE(kp);
                        cnt = 1;
                        return false;
                    }
                    id.vpid = kp->data.uint32;
                    /* unpack the blob for this rank */
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        cnt = 1;
                        return false;
                    }
                    if (0 != strcmp(PMIX_PROC_MAP, kp->key)) {
                        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                        OBJ_DESTRUCT(&buf);
                        OBJ_RELEASE(kp);
                        cnt = 1;
                        return false;
                    }
                    /* transfer the byte object for unpacking */
                    OBJ_CONSTRUCT(&buf2, opal_buffer_t);
                    opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size);
                    kp->data.bo.bytes = NULL;  // protect the data region
                    kp->data.bo.size = 0;
                    OBJ_RELEASE(kp);
                    /* unpack and store the map */
                    cnt=1;
                    while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) {
                        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                            "%s storing key %s for peer %s",
                                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                            kp->key, OPAL_NAME_PRINT(id));
                        if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) {
                            OPAL_ERROR_LOG(rc);
                            OBJ_RELEASE(kp);
                            OBJ_DESTRUCT(&buf2);
                            return false;
                        }
                        /* the dstore keeps its own copy, so drop ours */
                        OBJ_RELEASE(kp);
                        cnt = 1;
                    }
                    OBJ_DESTRUCT(&buf2);
                    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                        OPAL_ERROR_LOG(rc);
                        return false;
                    }
                    cnt=1;
                }
                OBJ_DESTRUCT(&buf);
                if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                    OPAL_ERROR_LOG(rc);
                    return false;
                }
                cnt=1;
                continue;
            }
            /* otherwise, it is a single piece of info, so store it */
            if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) {
                OPAL_ERROR_LOG(rc);
                OBJ_RELEASE(kp);
                cnt = 1;
                continue;
            }
            /* save the list of local peers */
            if (0 == strcmp(PMIX_LOCAL_PEERS, kp->key)) {
                OBJ_RETAIN(kp);
                lclpeers = kp;
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s saving local peers %s",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lclpeers->data.string);
            } else if (0 == strcmp(PMIX_JOBID, kp->key)) {
                native_pname.jobid = kp->data.uint32;
            } else if (0 == strcmp(PMIX_RANK, kp->key)) {
                native_pname.vpid = kp->data.uint32;
            }
            if (0 == strcmp(attr, kp->key)) {
                OBJ_RETAIN(kp);
                *kv = kp;
                found = true;
            }
            OBJ_RELEASE(kp);
            cnt = 1;
        }
        OBJ_RELEASE(bptr);
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            OPAL_ERROR_LOG(rc);
            return false;
        }
    } else {
        OPAL_ERROR_LOG(ret);
        OBJ_RELEASE(cb);
        return false;
    }
    OBJ_RELEASE(cb);
    opal_proc_set_name(&native_pname);

    /* if the list of local peers wasn't included, then we are done */
    if (NULL == lclpeers) {
        opal_output_verbose(0, opal_pmix_base_framework.framework_output,
                            "%s no local peers reported",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
        return found;
    }

    /* baseline all the procs as nonlocal */
    myrank = native_pname.vpid;
    id.jobid = native_pname.jobid;

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname,
                                                OPAL_DSTORE_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        /* guard against a stored but empty cpuset */
        cpuset = (NULL == kp->data.string) ? NULL : strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non local" */
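    /* lclpeers is a comma-separated list of the vpids sharing our node,
     * so split it and handle each peer in turn */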
    ranks = opal_argv_split(lclpeers->data.string, ',');
    for (i=0; NULL != ranks[i]; i++) {
        uint32_t vid = strtoul(ranks[i], NULL, 10);
        if (myrank == vid) {
            continue;
        }
        id.vpid = vid;
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id,
                                                    OPAL_DSTORE_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:native proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_DSTORE_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        (void)opal_dstore.store(opal_dstore_internal, &id, &kvn);
        OBJ_DESTRUCT(&kvn);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    opal_argv_free(ranks);

    return found;
}
Example 7
/* only APPS call this function - daemons have their own */
int orte_util_decode_pidmap(opal_byte_object_t *bo)
{
    orte_vpid_t i, num_procs, *vptr, daemon;
    orte_vpid_t *daemons=NULL;
    orte_local_rank_t *local_rank=NULL;
    orte_node_rank_t *node_rank=NULL;
#if OPAL_HAVE_HWLOC
    opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr;
    unsigned int *bind_idx=NULL, pbidx, *uiptr;
#endif
    opal_hwloc_locality_t locality;
    orte_std_cntr_t n;
    opal_buffer_t buf;
    int rc;
    orte_proc_state_t *states = NULL;
    orte_app_idx_t *app_idx = NULL;
    int32_t *restarts = NULL;
    orte_process_name_t proc, dmn;
    orte_namelist_t *nm;
    opal_list_t jobs;
    char *hostname;

    /* xfer the byte object to a buffer for unpacking */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
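    /* two passes over the data: first unpack each job's per-proc arrays
     * and store them, then (once every proc's daemon is known) go back
     * and compute each proc's locality relative to us */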
    
    n = 1;
    /* cycle through the buffer */
    OBJ_CONSTRUCT(&jobs, opal_list_t);
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &proc.jobid, &n, ORTE_JOBID))) {
        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap working job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(proc.jobid)));
        /* record the jobid */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = proc.jobid;
        opal_list_append(&jobs, &nm->super);

        /* unpack and store the number of procs */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NPROCS, &num_procs, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

#if OPAL_HAVE_HWLOC
        /* unpack and store the binding level */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* store it */
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* set mine */
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            orte_process_info.bind_level = bind_level;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                             "%s orte:util:decode:pidmap nprocs %s bind level %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(num_procs),
                             opal_hwloc_base_print_level(bind_level)));
#endif

        /* allocate memory for the daemon info */
        daemons = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t));
        /* unpack it in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, daemons, &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* allocate memory for local ranks */
        local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t));
        /* unpack them in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_local_rank = local_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK,
                                                    &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
        
        /* allocate memory for node ranks */
        node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t));
        /* unpack node ranks in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.my_node_rank = node_rank[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK,
                                                    &orte_process_info.my_node_rank, ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
        
#if OPAL_HAVE_HWLOC
        /* allocate memory for bind_idx */
        bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int));
        /* unpack bind_idx in one shot */
        n=num_procs;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (proc.jobid == ORTE_PROC_MY_NAME->jobid) {
            /* set mine */
            orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid];
            if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX,
                                                    &orte_process_info.bind_idx, OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
#endif

        /* allocate memory for states */
        states = (orte_proc_state_t*)malloc(num_procs*sizeof(orte_proc_state_t));
        /* unpack states in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, states, &n, ORTE_PROC_STATE))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(states);
        states = NULL;

        /* allocate memory for app_idx's */
        app_idx = (orte_app_idx_t*)malloc(num_procs*sizeof(orte_app_idx_t));
        /* unpack app_idx's in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, app_idx, &n, ORTE_APP_IDX))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(app_idx);
        app_idx = NULL;

        /* allocate memory for restarts */
        restarts = (int32_t*)malloc(num_procs*sizeof(int32_t));
        /* unpack restarts in one shot */
        n=num_procs;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, restarts, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* dump this info - apps don't need it */
        free(restarts);
        restarts = NULL;

        /* set the daemon jobid */
        dmn.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid);

        /* xfer the data */
        for (i=0; i < num_procs; i++) {
            if (proc.jobid == ORTE_PROC_MY_NAME->jobid &&
                i == ORTE_PROC_MY_NAME->vpid) {
                continue;
            }
            proc.vpid = i;
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &daemons[i], ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* lookup and store the hostname for this proc */
            dmn.vpid = daemons[i];
            if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank[i], ORTE_LOCAL_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank[i], ORTE_NODE_RANK))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
#if OPAL_HAVE_HWLOC
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx[i], OPAL_UINT))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((10, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap proc %s host %s lrank %d nrank %d bindidx %u",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc), hostname,
                                 (int)local_rank[i], (int)node_rank[i], bind_idx[i]));
#endif
        }
        /* release data */
        free(daemons);
        daemons = NULL;
        free(local_rank);
        local_rank = NULL;
        free(node_rank);
        node_rank = NULL;
#if OPAL_HAVE_HWLOC
        free(bind_idx);
        bind_idx = NULL;
#endif
        /* setup for next cycle */
        n = 1;
    }
    if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    rc = ORTE_SUCCESS;

    /* now that we have all the data, we are guaranteed
     * to know our own node, so go back and record the
     * locality of each proc relative to me
     */
    while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&jobs))) {
        proc.jobid = nm->name.jobid;
        /* recover the number of procs in this job */
        vptr = &num_procs;
        proc.vpid = ORTE_VPID_INVALID;
        if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_NPROCS, (void**)&vptr, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        for (i=0; i < num_procs; i++) {
            if (ORTE_PROC_MY_NAME->vpid == i &&
                ORTE_PROC_MY_NAME->jobid == proc.jobid) {
                /* this is me */
                continue;
            }
            proc.vpid = i;
            /* recover the daemon for this proc */
            vptr = &daemon;
            if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_DAEMON_VPID, (void**)&vptr, ORTE_VPID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (daemon == ORTE_PROC_MY_DAEMON->vpid) {
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s shares node",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc)));
                /* we share a node, so add them to the count of peers
                 * sharing the node with me
                 */
                orte_process_info.num_local_peers++;
#if OPAL_HAVE_HWLOC
                /* retrieve the bind level for the other proc's job */
                lvptr = &pbind;
                proc.vpid = ORTE_VPID_INVALID;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_LEVEL, (void**)&lvptr, OPAL_HWLOC_LEVEL_T))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }

                /* retrieve the other's proc's bind idx */
                uiptr = &pbidx;
                proc.vpid = i;
                if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_INDEX, (void**)&uiptr, OPAL_UINT))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }

                /* we share a node - see what else we share */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.bind_level,
                                                                 orte_process_info.bind_idx,
                                                                 pbind, pbidx);
#else
                locality = OPAL_PROC_ON_NODE;
#endif
            } else {
                /* we don't share a node */
                OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                     "%s orte:util:decode:pidmap proc %s does NOT node [my daemon %s, their daemon %s]",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc),
                                     ORTE_VPID_PRINT(ORTE_PROC_MY_DAEMON->vpid),
                                     ORTE_VPID_PRINT(daemon)));
                locality = OPAL_PROC_NON_LOCAL;
            }
            /* store the locality */
            OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output,
                                 "%s orte:util:decode:pidmap set proc %s locality to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proc),
                                 opal_hwloc_base_print_locality(locality)));
            if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
    }

 cleanup:
    if (NULL != daemons) {
        free(daemons);
    }
    if (NULL != local_rank) {
        free(local_rank);
    }
    if (NULL != node_rank) {
        free(node_rank);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != bind_idx) {
        free(bind_idx);
    }
#endif
    if (NULL != states) {
        free(states);
    }
    if (NULL != app_idx) {
        free(app_idx);
    }
    if (NULL != restarts) {
        free(restarts);
    }
    OBJ_DESTRUCT(&buf);
    return rc;
}
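
A minimal read-back sketch, assuming the pidmap above has already been decoded and committed to orte_db: the helper name is hypothetical, while the fetch pattern, ORTE_DB_LOCALITY, OPAL_HWLOC_LOCALITY_T and the OPAL_PROC_ON_NODE bit are the ones used in the code above.

/* hypothetical helper: returns true if "peer" was recorded as sharing
 * our node by the decode logic above (assumes the pidmap has already
 * been decoded and its locality stored in orte_db) */
static bool peer_is_on_my_node(orte_process_name_t *peer)
{
    int rc;
    opal_hwloc_locality_t locality, *lptr = &locality;

    if (ORTE_SUCCESS != (rc = orte_db.fetch(peer, ORTE_DB_LOCALITY,
                                            (void**)&lptr, OPAL_HWLOC_LOCALITY_T))) {
        ORTE_ERROR_LOG(rc);
        return false;
    }
    /* locality is a bitmask - OPAL_PROC_ON_NODE is set when the peer
     * shares our node */
    return (OPAL_PROC_ON_NODE & locality) ? true : false;
}
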
void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata)
{
    int rc, cnt;
    orte_process_name_t pname;
    char *hostname;
    orte_vpid_t daemon;
    orte_node_rank_t node_rank;
    orte_local_rank_t local_rank;
    orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata;
    opal_hwloc_locality_t locality;

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s STORING PEER MODEX DATA",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the process name */
    cnt=1;
    while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &pname, &cnt, ORTE_NAME))) {
        /* unpack and store the hostname */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack and store the daemon vpid */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_DAEMON_VPID, &daemon, OPAL_UINT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack and store the node rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* unpack the local rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* compute the locality and store in the database */
#if OPAL_HAVE_HWLOC
        {
            char *cpuset;

            /* unpack and store the cpuset - could be NULL */
            cnt = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &cpuset, &cnt, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_CPUSET, cpuset, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                                 "%s store:peer:modex setting proc %s cpuset %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname), cpuset));

            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) {
                /* if this data is from myself, then set locality to all */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale ALL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname)));
                locality = OPAL_PROC_ALL_LOCAL;
            } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
                /* this is on a different node, so mark it as non-local */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale NONLOCAL",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname)));
                locality = OPAL_PROC_NON_LOCAL;
            } else if (NULL == cpuset || NULL == orte_process_info.cpuset) {
                /* one or both of us is not bound, so all we can say is we are on the
                 * same node
                 */
                locality = OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.cpuset,
                                                                 cpuset);
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                     "%s store:peer:modex setting proc %s locale %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pname),
                                     opal_hwloc_base_print_locality(locality)));
            }
        }
#else
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) {
            /* if this data is from myself, then set locality to all */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:base:modex setting proc %s locale ALL",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname)));
            locality = OPAL_PROC_ALL_LOCAL;
        } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) {
            /* this is on a different node, so mark it as non-local */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s store:peer:modex setting proc %s locale NONLOCAL",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname)));
            locality = OPAL_PROC_NON_LOCAL;
        } else {
            /* must be on our node */
            locality = OPAL_PROC_ON_NODE;
        }
#endif
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex: adding modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&pname)));
        
        /* update the modex database */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&pname, rbuf))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }            
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex: completed modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&pname)));
    }    

 cleanup:
    /* flag the collective as complete */
    modex->active = false;
    /* cleanup the list, but don't release the
     * collective object as it was passed into us
     */
    opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super);
    /* notify that the modex is complete */
    if (NULL != modex->cbfunc) {
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s CALLING MODEX RELEASE",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        modex->cbfunc(NULL, modex->cbdata);
    } else {
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s store:peer:modex NO MODEX RELEASE CBFUNC",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }
}
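
For orientation, a rough sketch of the sender side this loop assumes: each per-proc entry must be packed as name, hostname, daemon vpid, node rank, local rank and (when hwloc is available) cpuset, in that order. The function and its parameters are hypothetical; only the opal_dss.pack types mirror the unpacks above.

/* hypothetical sender-side sketch: pack one proc's modex header in the
 * order expected by orte_grpcomm_base_store_peer_modex() above */
static int pack_peer_modex_entry(opal_buffer_t *buf,
                                 orte_process_name_t *name,
                                 char *hostname, orte_vpid_t daemon,
                                 orte_node_rank_t node_rank,
                                 orte_local_rank_t local_rank,
                                 char *cpuset)
{
    int rc;

    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, name, 1, ORTE_NAME)) ||
        OPAL_SUCCESS != (rc = opal_dss.pack(buf, &hostname, 1, OPAL_STRING)) ||
        OPAL_SUCCESS != (rc = opal_dss.pack(buf, &daemon, 1, ORTE_VPID)) ||
        OPAL_SUCCESS != (rc = opal_dss.pack(buf, &node_rank, 1, ORTE_NODE_RANK)) ||
        OPAL_SUCCESS != (rc = opal_dss.pack(buf, &local_rank, 1, ORTE_LOCAL_RANK))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
#if OPAL_HAVE_HWLOC
    /* the cpuset may legitimately be NULL when the proc is unbound */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &cpuset, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
#endif
    return ORTE_SUCCESS;
}
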
Ejemplo n.º 9
0
static void fencenb(int sd, short args, void *cbdata)
{
    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
    int rc = OPAL_SUCCESS;
    int32_t i;
    opal_value_t *kp, kvn;
    opal_hwloc_locality_t locality;
    opal_process_name_t pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:s2 called fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* check if there is a partially filled meta key and, if so, put it */
    opal_pmix_base_commit_packed (&pmix_packed_data, &pmix_packed_data_offset,
                                  &pmix_packed_encoded_data, &pmix_packed_encoded_data_offset,
                                  pmix_vallen_max, &pmix_pack_key, kvs_put);

    /* now call fence */
    if (PMI2_SUCCESS != PMI2_KVS_Fence()) {
        rc = OPAL_ERROR;
        goto cleanup;
    }

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    pname.jobid = OPAL_PROC_MY_NAME.jobid;
    if (!got_modex_data) {
        got_modex_data = true;
        /* we only need to set locality for each local rank as "not found"
         * equates to "non-local" */
        for (i=0; i < s2_nlranks; i++) {
            pname.vpid = s2_lranks[i];
            rc = opal_pmix_base_cache_keys_locally(&s2_pname, OPAL_PMIX_CPUSET,
                                                   &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
            if (OPAL_SUCCESS != rc) {
                OPAL_ERROR_LOG(rc);
                goto cleanup;
            }
            if (NULL == kp || NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 opal_process_info.cpuset,
                                                                 kp->data.string);
            }
            if (NULL != kp) {
                OBJ_RELEASE(kp);
            }
            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:s2 proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                 OPAL_NAME_PRINT(s2_pname),
                                 opal_hwloc_base_print_locality(locality)));

            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_PMIX_LOCALITY);
            kvn.type = OPAL_UINT16;
            kvn.data.uint16 = locality;
            opal_pmix_base_store(&pname, &kvn);
            OBJ_DESTRUCT(&kvn);
        }
    }

cleanup:
    if (NULL != op->opcbfunc) {
        op->opcbfunc(rc, op->cbdata);
    }
    OBJ_RELEASE(op);
    return;
}
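
A small sketch of a caller-side completion callback matching the op->opcbfunc(rc, op->cbdata) call in the cleanup path above; the callback name and the flag passed through cbdata are assumptions for illustration.

/* hypothetical completion callback: fencenb() above calls
 * op->opcbfunc(rc, op->cbdata) once the PMI2 fence and the local
 * locality caching are finished */
static void fence_release(int status, void *cbdata)
{
    volatile bool *active = (volatile bool*)cbdata;

    if (OPAL_SUCCESS != status) {
        OPAL_ERROR_LOG(status);
    }
    *active = false;   /* let the waiter proceed */
}
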
Ejemplo n.º 10
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *string_key;
    opal_value_t *kv;
    char *val;
    int u32, *u32ptr;
    uint16_t u16, *u16ptr;
    char **peers=NULL, *mycpuset;
    opal_process_name_t wildcard_rank, pname;
    bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false;
    size_t i;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* get an async event base - we use the opal_async one so
     * we don't startup extra threads if not needed */
    orte_event_base = opal_progress_thread_init(NULL);
    progress_thread_running = true;

    /* open and setup pmix */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        /* we cannot run */
        error = "pmix init";
        goto error;
    }
    if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
        /* we cannot run */
        error = "pmix init";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* initialize the selected module */
    if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
        /* we cannot run - this could be due to being direct launched
         * without the required PMI support being built. Try to detect
         * that scenario and warn the user */
        if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
            NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
            if (0 == strcmp(envar, "SLURM")) {
                /* yes to both - so emit a hopefully helpful
                 * error message and abort */
                orte_show_help_finalize();
                orte_show_help("help-ess-base.txt", "slurm-error", true);
                return ORTE_ERR_SILENT;
            } else if (0 == strcmp(envar, "ALPS")) {
                /* we were direct launched by ALPS */
                orte_show_help_finalize();
                orte_show_help("help-ess-base.txt", "alps-error", true);
                return ORTE_ERR_SILENT;
            }
        }
        error = "pmix init";
        goto error;
    }
    u32ptr = &u32;
    u16ptr = &u16;

    /****   THE FOLLOWING ARE REQUIRED VALUES   ***/
    /* pmix.init set our process name down in the OPAL layer,
     * so carry it forward here */
    ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
    ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;

    /* setup a name for retrieving data associated with the job */
    wildcard_rank.jobid = ORTE_PROC_MY_NAME->jobid;
    wildcard_rank.vpid = ORTE_NAME_WILDCARD->vpid;

    /* setup a name for retrieving proc-specific data */
    pname.jobid = ORTE_PROC_MY_NAME->jobid;
    pname.vpid = 0;

    /* get our local rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting local rank";
        goto error;
    }
    orte_process_info.my_local_rank = u16;

    /* get our node rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting node rank";
        goto error;
    }
    orte_process_info.my_node_rank = u16;

    /* get max procs for this application */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
                          &wildcard_rank, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting max procs";
        goto error;
    }
    orte_process_info.max_procs = u32;

    /* get job size */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE,
                          &wildcard_rank, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting job size";
        goto error;
    }
    orte_process_info.num_procs = u32;

    /* push into the environ for pickup in MPI layer for
     * MPI-3 required info key
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
        asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs);
        putenv(ev1);
        added_num_procs = true;
    }
    if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs);
        putenv(ev2);
        added_app_ctx = true;
    }


    /* get our app number from PMI - ok if not found */
    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
                                   ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.app_num = u32;
    } else {
        orte_process_info.app_num = 0;
    }

    /* get the number of local peers - required for wireup of
     * shared memory BTL */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE,
                          &wildcard_rank, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.num_local_peers = u32 - 1;  // want number besides ourselves
    } else {
        orte_process_info.num_local_peers = 0;
    }

    /* get number of nodes in the job */
    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NUM_NODES,
                                   &wildcard_rank, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.num_nodes = u32;
    }

    /* setup transport keys in case the MPI layer needs them -
     * we can use the jobfam and stepid as unique keys
     * because they are unique values assigned by the RM
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
        unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
        unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_output_verbose(2, orte_ess_base_framework.framework_output,
                            "%s transport key %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key);
        asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        putenv(envar);
        added_transport_keys = true;
        /* cannot free the envar as that messes up our environ */
        free(string_key);
    }

    /* retrieve temp directories info */
    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING);
    if (OPAL_SUCCESS == ret && NULL != val) {
        /* We want to give the user the ability
         * to override RM settings at their own risk
         */
        if( NULL == orte_process_info.top_session_dir ){
            orte_process_info.top_session_dir = val;
        } else {
            /* keep the MCA setting */
            tdir_mca_override = true;
            free(val);
        }
        val = NULL;
    }

    if( !tdir_mca_override ){
        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING);
        if (OPAL_SUCCESS == ret && NULL != val) {
            /* We want to give the user the ability
             * to override RM settings at their own risk
             */
            if( NULL == orte_process_info.job_session_dir ){
                orte_process_info.job_session_dir = val;
            } else {
                /* keep the MCA setting */
                free(val);
                tdir_mca_override = true;
            }
            val = NULL;
        }
    }

    if( !tdir_mca_override ){
        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING);
        if (OPAL_SUCCESS == ret && NULL != val) {
            /* We want to give the user the ability
             * to override RM settings at their own risk
             */
            if( NULL == orte_process_info.proc_session_dir ){
                orte_process_info.proc_session_dir = val;
            } else {
                /* keep the MCA setting */
                tdir_mca_override = true;
                free(val);
            }
            val = NULL;
        }
    }

    if( !tdir_mca_override ){
        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL);
        if (OPAL_SUCCESS == ret ) {
            orte_process_info.rm_session_dirs = bool_val;
        }
    }

    /* get our local peers */
    if (0 < orte_process_info.num_local_peers) {
        /* if my local rank is too high, then that's an error */
        if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) {
            ret = ORTE_ERR_BAD_PARAM;
            error = "num local peers";
            goto error;
        }
        /* retrieve the local peers */
        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
                              &wildcard_rank, &val, OPAL_STRING);
        if (OPAL_SUCCESS == ret && NULL != val) {
            peers = opal_argv_split(val, ',');
            free(val);
        } else {
            peers = NULL;
        }
    } else {
        peers = NULL;
    }

    /* set the locality */
    if (NULL != peers) {
        /* identify our location */
        val = NULL;
        OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
                                       ORTE_PROC_MY_NAME, &val, OPAL_STRING);
        if (OPAL_SUCCESS == ret && NULL != val) {
            mycpuset = val;
        } else {
            mycpuset = NULL;
        }
        pname.jobid = ORTE_PROC_MY_NAME->jobid;
        for (i=0; NULL != peers[i]; i++) {
            pname.vpid = strtoul(peers[i], NULL, 10);
            if (pname.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* we are fully local to ourselves */
                u16 = OPAL_PROC_ALL_LOCAL;
            } else {
                val = NULL;
                OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
                                               &pname, &val, OPAL_STRING);
                if (OPAL_SUCCESS == ret && NULL != val) {
                    u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
                    free(val);
                } else {
                    /* all we can say is that it shares our node */
                    u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
                }
            }
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCALITY);
            kv->type = OPAL_UINT16;
            OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
                                 "%s ess:pmi:locality: proc %s locality %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16)));
            kv->data.uint16 = u16;
            ret = opal_pmix.store_local(&pname, kv);
            if (OPAL_SUCCESS != ret) {
                error = "local store of locality";
                opal_argv_free(peers);
                if (NULL != mycpuset) {
                    free(mycpuset);
                }
                goto error;
            }
            OBJ_RELEASE(kv);
        }
        opal_argv_free(peers);
        if (NULL != mycpuset) {
            free(mycpuset);
        }
    }

    /* now that we have all required info, complete the setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    /* setup process binding */
    if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
        error = "proc_binding";
        goto error;
    }

    /* this needs to be set to enable debugger use when direct launched */
    if (NULL == orte_process_info.my_daemon_uri) {
        orte_standalone_operation = true;
    }

    /* set max procs */
    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /* push our hostname so others can find us, if they need to - the
     * native PMIx component will ignore this request as the hostname
     * is provided by the system */
    OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
    if (ORTE_SUCCESS != ret) {
        error = "db store hostname";
        goto error;
    }

    /* if we are an ORTE app - and not an MPI app - then
     * we need to exchange our connection info here.
     * MPI_Init has its own modex, so we don't need to do
     * two of them. However, if we don't do a modex at all,
     * then processes have no way to communicate
     *
     * NOTE: only do this when the process originally launches.
     * Cannot do this on a restart as the rest of the processes
     * in the job won't be executing this step, so we would hang
     */
    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
        /* need to commit the data before we fence */
        opal_pmix.commit();
        opal_pmix.fence(NULL, 0);
    }

    return ORTE_SUCCESS;

 error:
    if (!progress_thread_running) {
        /* can't send the help message, so ensure it
         * comes out locally
         */
        orte_show_help_finalize();
    }
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ret;
}
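
As a closing usage sketch (not part of rte_init), this is one way the locality values cached with opal_pmix.store_local() above could be read back for a peer; the helper and the chosen peer rank are hypothetical, and only macros and keys already used in this example are assumed.

/* hypothetical follow-up: check whether a given peer shares our node,
 * assuming rte_init() above has already stored OPAL_PMIX_LOCALITY */
static bool peer_shares_node(orte_vpid_t peer_vpid)
{
    int ret;
    uint16_t u16, *u16ptr = &u16;
    opal_process_name_t peer;

    peer.jobid = ORTE_PROC_MY_NAME->jobid;
    peer.vpid = peer_vpid;

    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY,
                                   &peer, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        return false;   /* "not found" equates to non-local */
    }
    return (u16 & OPAL_PROC_ON_NODE) ? true : false;
}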