Пример #1
0
int opal_hwloc_unpack(opal_buffer_t *buffer, void *dest,
                      int32_t *num_vals,
                      opal_data_type_t type)
{
    /* NOTE: hwloc defines topology_t as a pointer to a struct! */
    hwloc_topology_t t, *tarray  = (hwloc_topology_t*)dest;
    int rc=OPAL_SUCCESS, i, cnt, j;
    char *xmlbuffer;
    struct hwloc_topology_support *support;

    for (i=0, j=0; i < *num_vals; i++) {
        /* unpack the xml string */
        cnt=1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &xmlbuffer, &cnt, OPAL_STRING))) {
            goto cleanup;
        }

        /* convert the xml */
        if (0 != hwloc_topology_init(&t)) {
            rc = OPAL_ERROR;
            free(xmlbuffer);
            goto cleanup;
        }
        if (0 != hwloc_topology_set_xmlbuffer(t, xmlbuffer, strlen(xmlbuffer))) {
            rc = OPAL_ERROR;
            free(xmlbuffer);
            hwloc_topology_destroy(t);
            goto cleanup;
        }
        free(xmlbuffer);
        /* since we are loading this from an external source, we have to
         * explicitly set a flag so hwloc sets things up correctly
         */
        if (0 != hwloc_topology_set_flags(t, (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                              HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                              HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
            rc = OPAL_ERROR;
            hwloc_topology_destroy(t);
            goto cleanup;
        }
        /* now load the topology */
        if (0 != hwloc_topology_load(t)) {
            rc = OPAL_ERROR;
            hwloc_topology_destroy(t);
            goto cleanup;
        }

        /* get the available support - hwloc unfortunately does
         * not include this info in its xml import!
         */
        support = (struct hwloc_topology_support*)hwloc_topology_get_support(t);
        cnt = sizeof(struct hwloc_topology_discovery_support);
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, support->discovery, &cnt, OPAL_BYTE))) {
            goto cleanup;
        }
        cnt = sizeof(struct hwloc_topology_cpubind_support);
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, support->cpubind, &cnt, OPAL_BYTE))) {
            goto cleanup;
        }
        cnt = sizeof(struct hwloc_topology_membind_support);
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, support->membind, &cnt, OPAL_BYTE))) {
            goto cleanup;
        }

        /* filter the cpus thru any default cpu set */
        if (OPAL_SUCCESS != (rc = opal_hwloc_base_filter_cpus(t))) {
            goto cleanup;
        }

        /* pass it back */
        tarray[i] = t;

        /* track the number added */
        j++;
    }

 cleanup:
    *num_vals = j;
    return rc;
}
Пример #2
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *string_key;
    char *rmluri;
    opal_value_t *kv;
    char *val;
    int u32, *u32ptr;
    uint16_t u16, *u16ptr;
    char **peers=NULL, *mycpuset, **cpusets=NULL;
    opal_process_name_t name;
    size_t i;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* get an async event base - we use the opal_async one so
     * we don't startup extra threads if not needed */
    orte_event_base = opal_progress_thread_init(NULL);
    progress_thread_running = true;

    /* open and setup pmix */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        /* we cannot run */
        error = "pmix init";
        goto error;
    }
    if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
        /* we cannot run */
        error = "pmix init";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* initialize the selected module */
    if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
        /* we cannot run */
        error = "pmix init";
        goto error;
    }
    u32ptr = &u32;
    u16ptr = &u16;

    /****   THE FOLLOWING ARE REQUIRED VALUES   ***/
    /* pmix.init set our process name down in the OPAL layer,
     * so carry it forward here */
    ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
    ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;

    /* get our local rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting local rank";
        goto error;
    }
    orte_process_info.my_local_rank = u16;

    /* get our node rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting node rank";
        goto error;
    }
    orte_process_info.my_node_rank = u16;

    /* get max procs */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
                          ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting max procs";
        goto error;
    }
    orte_process_info.max_procs = u32;

    /* get job size */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE,
                          ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting job size";
        goto error;
    }
    orte_process_info.num_procs = u32;

    /* push into the environ for pickup in MPI layer for
     * MPI-3 required info key
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
        asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs);
        putenv(ev1);
        added_num_procs = true;
    }
    if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs);
        putenv(ev2);
        added_app_ctx = true;
    }


    /* get our app number from PMI - ok if not found */
    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM,
                                   ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.app_num = u32;
    } else {
        orte_process_info.app_num = 0;
    }

    /* get the number of local peers - required for wireup of
     * shared memory BTL */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE,
                          ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.num_local_peers = u32 - 1;  // want number besides ourselves
    } else {
        orte_process_info.num_local_peers = 0;
    }

    /* setup transport keys in case the MPI layer needs them -
     * we can use the jobfam and stepid as unique keys
     * because they are unique values assigned by the RM
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
        unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
        unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_output_verbose(2, orte_ess_base_framework.framework_output,
                            "%s transport key %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key);
        asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        putenv(envar);
        added_transport_keys = true;
        /* cannot free the envar as that messes up our environ */
        free(string_key);
    }

    /* retrieve our topology */
    val = NULL;
    OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,
                                   ORTE_PROC_MY_NAME, &val, OPAL_STRING);
    if (OPAL_SUCCESS == ret && NULL != val) {
        /* load the topology */
        if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            free(val);
            error = "setting topology";
            goto error;
        }
        if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
            ret = OPAL_ERROR;
            free(val);
            hwloc_topology_destroy(opal_hwloc_topology);
            error = "setting topology";
            goto error;
        }
        /* since we are loading this from an external source, we have to
         * explicitly set a flag so hwloc sets things up correctly
         */
        if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
                                          (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        /* now load the topology */
        if (0 != hwloc_topology_load(opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        free(val);
        /* filter the cpus thru any default cpu set */
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) {
            error = "filtering topology";
            goto error;
        }
    } else {
        /* it wasn't passed down to us, so go get it */
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
        /* push it into the PMIx database in case someone
         * tries to retrieve it so we avoid an attempt to
         * get it again */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
        kv->type = OPAL_STRING;
        if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
            error = "topology export";
            goto error;
        }
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) {
            error = "topology store";
            goto error;
        }
        OBJ_RELEASE(kv);
    }

    /* get our local peers */
    if (0 < orte_process_info.num_local_peers) {
        /* if my local rank if too high, then that's an error */
        if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) {
            ret = ORTE_ERR_BAD_PARAM;
            error = "num local peers";
            goto error;
        }
        /* retrieve the local peers */
        OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS,
                              ORTE_PROC_MY_NAME, &val, OPAL_STRING);
        if (OPAL_SUCCESS == ret && NULL != val) {
            peers = opal_argv_split(val, ',');
            free(val);
            /* and their cpusets, if available */
            OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING);
            if (OPAL_SUCCESS == ret && NULL != val) {
                cpusets = opal_argv_split(val, ':');
                free(val);
            } else {
                cpusets = NULL;
            }
        } else {
            peers = NULL;
            cpusets = NULL;
        }
    } else {
        peers = NULL;
        cpusets = NULL;
    }

    /* set the locality */
    if (NULL != peers) {
        /* indentify our cpuset */
        if (NULL != cpusets) {
            mycpuset = cpusets[orte_process_info.my_local_rank];
        } else {
            mycpuset = NULL;
        }
        name.jobid = ORTE_PROC_MY_NAME->jobid;
        for (i=0; NULL != peers[i]; i++) {
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCALITY);
            kv->type = OPAL_UINT16;
            name.vpid = strtoul(peers[i], NULL, 10);
            if (name.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* we are fully local to ourselves */
                u16 = OPAL_PROC_ALL_LOCAL;
            } else if (NULL == mycpuset || NULL == cpusets[i] ||
                       0 == strcmp(cpusets[i], "UNBOUND")) {
                /* all we can say is that it shares our node */
                u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* we have it, so compute the locality */
                u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]);
            }
            OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
                                 "%s ess:pmi:locality: proc %s locality %x",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name), u16));
            kv->data.uint16 = u16;
            ret = opal_pmix.store_local(&name, kv);
            if (OPAL_SUCCESS != ret) {
                error = "local store of locality";
                opal_argv_free(peers);
                opal_argv_free(cpusets);
                goto error;
            }
            OBJ_RELEASE(kv);
        }
        opal_argv_free(peers);
        opal_argv_free(cpusets);
    }

    /* now that we have all required info, complete the setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    /* setup process binding */
    if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
        error = "proc_binding";
        goto error;
    }

    /* this needs to be set to enable debugger use when direct launched */
    if (NULL == orte_process_info.my_daemon_uri) {
        orte_standalone_operation = true;
    }

    /* set max procs */
    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /***  PUSH DATA FOR OTHERS TO FIND   ***/

    /* push our RML URI in case others need to talk directly to us */
    rmluri = orte_rml.get_contact_info();
    /* push it out for others to use */
    OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING);
    if (ORTE_SUCCESS != ret) {
        error = "pmix put uri";
        goto error;
    }
    free(rmluri);

    /* push our hostname so others can find us, if they need to */
    OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
    if (ORTE_SUCCESS != ret) {
        error = "db store hostname";
        goto error;
    }

    /* if we are an ORTE app - and not an MPI app - then
     * we need to exchange our connection info here.
     * MPI_Init has its own modex, so we don't need to do
     * two of them. However, if we don't do a modex at all,
     * then processes have no way to communicate
     *
     * NOTE: only do this when the process originally launches.
     * Cannot do this on a restart as the rest of the processes
     * in the job won't be executing this step, so we would hang
     */
    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
        opal_pmix.fence(NULL, 0);
    }

    return ORTE_SUCCESS;

error:
    if (!progress_thread_running) {
        /* can't send the help message, so ensure it
         * comes out locally
         */
        orte_show_help_finalize();
    }
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ret;
}
Пример #3
0
static int allocate(orte_job_t *jdata, opal_list_t *nodes)
{
    int i, n, val, dig, num_nodes;
    orte_node_t *node;
#if OPAL_HAVE_HWLOC
    orte_topology_t *t;
    hwloc_topology_t topo;
    hwloc_obj_t obj;
    unsigned j, k;
    struct hwloc_topology_support *support;
    char **files=NULL;
    char **topos = NULL;
    bool use_local_topology = false;
#endif
    char **node_cnt=NULL;
    char **slot_cnt=NULL;
    char **max_slot_cnt=NULL;
    char *tmp;
    char prefix[6];

    node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
    if (NULL != mca_ras_simulator_component.slots) {
        slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
        /* backfile the slot_cnt so every topology has a cnt */
        tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
        for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
            opal_argv_append_nosize(&slot_cnt, tmp);
        }
    }
    if (NULL != mca_ras_simulator_component.slots_max) {
        max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
        /* backfill the max_slot_cnt as reqd */
        tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
        for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
            opal_argv_append_nosize(&max_slot_cnt, tmp);
        }
    }
 
#if OPAL_HAVE_HWLOC
    if (NULL != mca_ras_simulator_component.topofiles) {
        files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
        if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
            orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
            goto error_silent;
        }
    } else if (NULL != mca_ras_simulator_component.topologies) {
        topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
        if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
            orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
            goto error_silent;
        }
    } else {
        /* use our topology */
        use_local_topology = true;
    }
#else
    /* If we don't have hwloc and hwloc files were specified, then
       error out (because we can't deliver that functionality) */
    if (NULL == mca_ras_simulator_component.topofiles) {
        orte_show_help("help-ras-simulator.txt",
                       "no hwloc support for topofiles", true);
        goto error_silent;
    }
#endif

    /* setup the prefix to the node names */
    snprintf(prefix, 6, "nodeA");

    /* process the request */
    for (n=0; NULL != node_cnt[n]; n++) {
        num_nodes = strtol(node_cnt[n], NULL, 10);

        /* get number of digits */
        val = num_nodes;
        for (dig=0; 0 != val; dig++) {
            val /= 10;
        }

        /* set the prefix for this group of nodes */
        prefix[4] += n;

        /* check for topology */
#if OPAL_HAVE_HWLOC
        if (use_local_topology) {
            /* use our topology */
            topo = opal_hwloc_topology;
        } else if (NULL != files) {
            if (0 != hwloc_topology_init(&topo)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_init");
                goto error_silent;
            }
            if (0 != hwloc_topology_set_xml(topo, files[n])) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc failed to load xml", true, files[n]);
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            /* since we are loading this from an external source, we have to
             * explicitly set a flag so hwloc sets things up correctly
             */
            if (0 != hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_set_flags");
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            if (0 != hwloc_topology_load(topo)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_load");
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            /* remove the hostname from the topology. Unfortunately, hwloc
             * decided to add the source hostname to the "topology", thus
             * rendering it unusable as a pure topological description. So
             * we remove that information here.
             */
            obj = hwloc_get_root_obj(topo);
            for (k=0; k < obj->infos_count; k++) {
                if (NULL == obj->infos[k].name ||
                    NULL == obj->infos[k].value) {
                    continue;
                }
                if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
                    free(obj->infos[k].name);
                    free(obj->infos[k].value);
                    /* left justify the array */
                    for (j=k; j < obj->infos_count-1; j++) {
                        obj->infos[j] = obj->infos[j+1];
                    }
                    obj->infos[obj->infos_count-1].name = NULL;
                    obj->infos[obj->infos_count-1].value = NULL;
                    obj->infos_count--;
                    break;
                }
            }
            /* unfortunately, hwloc does not include support info in its
             * xml output :-(( To aid in debugging, we set it here
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
            support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
            support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
            /* add it to our array */
            t = OBJ_NEW(orte_topology_t);
            t->topo = topo;
            t->sig = opal_hwloc_base_get_topo_signature(topo);
            opal_pointer_array_add(orte_node_topologies, t);
        } else {
            if (0 != hwloc_topology_init(&topo)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_init");
                goto error_silent;
            }
            if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_set_synthetic");
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            if (0 != hwloc_topology_load(topo)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "hwloc_topology_load");
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            if (OPAL_SUCCESS != opal_hwloc_base_filter_cpus(topo)) {
                orte_show_help("help-ras-simulator.txt",
                               "hwloc API fail", true,
                               __FILE__, __LINE__, "opal_hwloc_base_filter_cpus");
                hwloc_topology_destroy(topo);
                goto error_silent;
            }
            /* remove the hostname from the topology. Unfortunately, hwloc
             * decided to add the source hostname to the "topology", thus
             * rendering it unusable as a pure topological description. So
             * we remove that information here.
             */
            obj = hwloc_get_root_obj(topo);
            for (k=0; k < obj->infos_count; k++) {
                if (NULL == obj->infos[k].name ||
                    NULL == obj->infos[k].value) {
                    continue;
                }
                if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
                    free(obj->infos[k].name);
                    free(obj->infos[k].value);
                    /* left justify the array */
                    for (j=k; j < obj->infos_count-1; j++) {
                        obj->infos[j] = obj->infos[j+1];
                    }
                    obj->infos[obj->infos_count-1].name = NULL;
                    obj->infos[obj->infos_count-1].value = NULL;
                    obj->infos_count--;
                    break;
                }
            }
            /* unfortunately, hwloc does not include support info in its
             * xml output :-(( To aid in debugging, we set it here
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
            support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
            support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
            /* add it to our array */
            t = OBJ_NEW(orte_topology_t);
            t->topo = topo;
            t->sig = opal_hwloc_base_get_topo_signature(topo);
            opal_pointer_array_add(orte_node_topologies, t);
        }
#endif

        for (i=0; i < num_nodes; i++) {
            node = OBJ_NEW(orte_node_t);
            asprintf(&node->name, "%s%0*d", prefix, dig, i);
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
#if OPAL_HAVE_HWLOC
            if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
                node->slots_max = 0;
            } else {
                obj = hwloc_get_root_obj(topo);
                node->slots_max = opal_hwloc_base_get_npus(topo, obj);
            }
            if (NULL == slot_cnt || NULL == slot_cnt[n]) {
                node->slots = 0;
            } else {
                obj = hwloc_get_root_obj(topo);
                node->slots = opal_hwloc_base_get_npus(topo, obj);
            }
            node->topology = topo;
#endif
            opal_output_verbose(1, orte_ras_base_framework.framework_output,
                                "Created Node <%10s> [%3d : %3d]",
                                node->name, node->slots, node->slots_max);
            opal_list_append(nodes, &node->super);
        }
    }

    /* record the number of allocated nodes */
    orte_num_allocated_nodes = opal_list_get_size(nodes);

    if (NULL != max_slot_cnt) {
        opal_argv_free(max_slot_cnt);
    }
    if (NULL != slot_cnt) {
        opal_argv_free(slot_cnt);
    }
    if (NULL != node_cnt) {
        opal_argv_free(node_cnt);
    }

    return ORTE_SUCCESS;

error_silent:
    if (NULL != max_slot_cnt) {
        opal_argv_free(max_slot_cnt);
    }
    if (NULL != slot_cnt) {
        opal_argv_free(slot_cnt);
    }
    if (NULL != node_cnt) {
        opal_argv_free(node_cnt);
    }
    return ORTE_ERR_SILENT;

}
Пример #4
0
int main(int argc, char* argv[])
{
    hwloc_obj_t obj;
    unsigned j, k;
    struct hwloc_topology_support *support;
    int rc;

    if (2 != argc) {
        fprintf(stderr, "Usage: opal_hwloc <topofile>\n");
        exit(1);
    }

    if (0 > (rc = opal_init(&argc, &argv))) {
        fprintf(stderr, "opal_hwloc: couldn't init opal - error code %d\n", rc);
        return rc;
    }

    if (0 != hwloc_topology_init(&my_topology)) {
        return OPAL_ERR_NOT_SUPPORTED;
    }
    if (0 != hwloc_topology_set_xml(my_topology, argv[1])) {
        hwloc_topology_destroy(my_topology);
        return OPAL_ERR_NOT_SUPPORTED;
    }
    /* since we are loading this from an external source, we have to
     * explicitly set a flag so hwloc sets things up correctly
     */
    if (0 != hwloc_topology_set_flags(my_topology,
                                      (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                       HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                       HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
        hwloc_topology_destroy(my_topology);
        return OPAL_ERR_NOT_SUPPORTED;
    }
    if (0 != hwloc_topology_load(my_topology)) {
        hwloc_topology_destroy(my_topology);
        return OPAL_ERR_NOT_SUPPORTED;
    }
    /* remove the hostname from the topology. Unfortunately, hwloc
     * decided to add the source hostname to the "topology", thus
     * rendering it unusable as a pure topological description. So
     * we remove that information here.
     */
    obj = hwloc_get_root_obj(my_topology);
    for (k=0; k < obj->infos_count; k++) {
        if (NULL == obj->infos[k].name ||
            NULL == obj->infos[k].value) {
            continue;
        }
        if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
            free(obj->infos[k].name);
            free(obj->infos[k].value);
            /* left justify the array */
            for (j=k; j < obj->infos_count-1; j++) {
                obj->infos[j] = obj->infos[j+1];
            }
            obj->infos[obj->infos_count-1].name = NULL;
            obj->infos[obj->infos_count-1].value = NULL;
            obj->infos_count--;
            break;
        }
    }
    /* unfortunately, hwloc does not include support info in its
     * xml output :-(( We default to assuming it is present as
     * systems that use this option are likely to provide
     * binding support
     */
    support = (struct hwloc_topology_support*)hwloc_topology_get_support(my_topology);
    support->cpubind->set_thisproc_cpubind = true;

    /* filter the cpus thru any default cpu set */
    opal_hwloc_base_filter_cpus(my_topology);

    /* fill opal_cache_line_size global with the smallest L1 cache
       line size */
    fill_cache_line_size();

    /* test it */
    if (NULL == hwloc_get_obj_by_type(my_topology, HWLOC_OBJ_CORE, 0)) {
        fprintf(stderr, "DIDN'T FIND A CORE\n");
    }

    hwloc_topology_destroy(my_topology);

    opal_finalize();

    return 0;
}