/*** MODEX SECTION ***/
static int modex(orte_grpcomm_collective_t *coll)
{
    int *local_ranks = NULL;      /* initialize so the later NULL check is meaningful */
    int local_rank_count = 0;
    opal_hwloc_locality_t locality;
    const char *cpuset;
    orte_process_name_t name;
    orte_vpid_t v;
    bool local;
    int rc, i;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* discover the local ranks */
#if WANT_PMI2_SUPPORT
    {
        char *pmapping = (char*)malloc(PMI2_MAX_VALLEN);
        int found, sid, nodes, k;
        orte_vpid_t n;
        char *p;

        if (NULL == pmapping) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found);
        if (!found || PMI_SUCCESS != rc) {
            /* can't check PMI2_SUCCESS as some folks (i.e., Cray) don't define it */
            opal_output(0, "%s could not get PMI_process_mapping (PMI2_Info_GetJobAttr() failed)",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            free(pmapping);
            return ORTE_ERROR;
        }

        /* parse the mapping vector: find the block of consecutive vpids that
         * share our node - n ends up as the first vpid on our node and
         * local_rank_count as the number of ranks on it */
        n = 0;
        local_rank_count = 0;
        if (NULL != (p = strstr(pmapping, "(vector"))) {
            while (NULL != (p = strstr(p+1, ",("))) {
                if (3 == sscanf(p, ",(%d,%d,%d)", &sid, &nodes, &local_rank_count)) {
                    for (k = 0; k < nodes; k++) {
                        if ((ORTE_PROC_MY_NAME->vpid >= n) &&
                            (ORTE_PROC_MY_NAME->vpid < (n + local_rank_count))) {
                            break;
                        }
                        n += local_rank_count;
                    }
                }
            }
        }
        free(pmapping);

        if (local_rank_count > 0) {
            local_ranks = (int*)malloc(local_rank_count * sizeof(int));
            for (i=0; i < local_rank_count; i++) {
                local_ranks[i] = n + i;
            }
        }

        if (NULL == local_ranks) {
            opal_output(0, "%s could not get PMI_process_mapping",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }
    }
#else
    rc = PMI_Get_clique_size(&local_rank_count);
    if (PMI_SUCCESS != rc) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        return ORTE_ERROR;
    }

    local_ranks = calloc(local_rank_count, sizeof(int));
    if (NULL == local_ranks) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    rc = PMI_Get_clique_ranks(local_ranks, local_rank_count);
    if (PMI_SUCCESS != rc) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        free(local_ranks);
        return ORTE_ERROR;
    }
#endif

    /* our RTE data was constructed and pushed in the ESS pmi component */

    /* commit our modex info */
    opal_db.commit((opal_identifier_t *)ORTE_PROC_MY_NAME);

    /* cycle thru all my peers and collect their RTE info */
    name.jobid = ORTE_PROC_MY_NAME->jobid;
    for (v=0; v < orte_process_info.num_procs; v++) {
        if (v == ORTE_PROC_MY_NAME->vpid) {
            continue;
        }
        name.vpid = v;

        /* check if this is a local process */
        for (i = 0, local = false; i < local_rank_count; ++i) {
            if ((orte_vpid_t) local_ranks[i] == v) {
                local = true;
                break;
            }
        }

        /* compute and store the locality as it isn't something that gets pushed
         * to PMI - doing this here will prevent the MPI layer from fetching data
         * for all ranks */
        if (local) {
            if (ORTE_SUCCESS != (rc = opal_db.fetch_pointer((opal_identifier_t*)&name, OPAL_DB_CPUSET,
                                                            (void **)&cpuset, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                free(local_ranks);
                return rc;
            }
            if (NULL == cpuset) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know */
                locality = OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 orte_process_info.cpuset,
                                                                 (char *) cpuset);
            }
        } else {
            /* this is on a different node, so mark it as non-local */
            locality = OPAL_PROC_NON_LOCAL;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi proc %s locality %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name),
                             opal_hwloc_base_print_locality(locality)));

        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_SCOPE_INTERNAL,
                                                OPAL_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            free(local_ranks);
            return rc;
        }
    }

    /* the list of local ranks is no longer needed */
    free(local_ranks);

    /* execute the callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }
    return rc;
}
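/*
 * A minimal, standalone sketch (not part of ORTE) of how a PMI_process_mapping
 * vector like the one parsed above can be interpreted: each "(base,nodes,ppn)"
 * triple describes a run of `nodes` consecutive nodes holding `ppn` ranks each.
 * Given a vpid we walk the triples until the block containing that vpid is found,
 * returning the first vpid on its node and the per-node rank count.  The helper
 * name and the example mapping string are illustrative only.
 */
#include <stdio.h>
#include <string.h>

static int find_local_block(const char *mapping, unsigned vpid,
                            unsigned *first_vpid, int *ppn_out)
{
    const char *p = strstr(mapping, "(vector");
    unsigned n = 0;
    int sid, nodes, ppn, k;

    if (NULL == p) {
        return -1;
    }
    while (NULL != (p = strstr(p + 1, ",("))) {
        if (3 == sscanf(p, ",(%d,%d,%d)", &sid, &nodes, &ppn)) {
            for (k = 0; k < nodes; k++) {
                if (vpid >= n && vpid < n + (unsigned)ppn) {
                    *first_vpid = n;
                    *ppn_out = ppn;
                    return 0;
                }
                n += ppn;
            }
        }
    }
    return -1;   /* vpid not described by the mapping */
}

int main(void)
{
    /* hypothetical mapping: two nodes with 4 ranks each, then one node with 2 */
    const char *map = "(vector,(0,2,4),(2,1,2))";
    unsigned first;
    int ppn;

    if (0 == find_local_block(map, 5, &first, &ppn)) {
        printf("vpid 5: first vpid on node = %u, ranks on node = %d\n", first, ppn);
    }
    return 0;
}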
static int cray_fence(opal_process_name_t *procs, size_t nprocs)
{
    int rc;
    int32_t i;
    opal_value_t *kp, kvn;
    opal_hwloc_locality_t locality;

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray called fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* check if there is a partially filled meta key and, if so, put it */
    if (0 != pmix_packed_data_offset && NULL != pmix_packed_data) {
        opal_pmix_base_commit_packed(pmix_packed_data, pmix_packed_data_offset,
                                     pmix_vallen_max, &pmix_pack_key, kvs_put);
        pmix_packed_data_offset = 0;
        free(pmix_packed_data);
        pmix_packed_data = NULL;
    }

    if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) {
        OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence");
        return OPAL_ERROR;
    }

    opal_output_verbose(10, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    if (!pmix_got_modex_data) {
        pmix_got_modex_data = true;
        /* we only need to set locality for each local rank as "not found"
         * equates to "non-local" */
        for (i=0; i < pmix_nlranks; i++) {
            pmix_pname.vpid = pmix_lranks[i];
            rc = opal_pmix_base_cache_keys_locally((opal_identifier_t*)&pmix_pname, OPAL_DSTORE_CPUSET,
                                                   &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
            if (OPAL_SUCCESS != rc) {
                OPAL_ERROR_LOG(rc);
                return rc;
            }
#if OPAL_HAVE_HWLOC
            if (NULL == kp || NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 opal_process_info.cpuset,
                                                                 kp->data.string);
            }
            if (NULL != kp) {
                OBJ_RELEASE(kp);
            }
#else
            /* all we know is we share a node */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:cray proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                 OPAL_NAME_PRINT(*(opal_identifier_t*)&pmix_pname),
                                 opal_hwloc_base_print_locality(locality)));

            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_DSTORE_LOCALITY);
            kvn.type = OPAL_UINT16;
            kvn.data.uint16 = locality;
            (void)opal_dstore.store(opal_dstore_internal, (opal_identifier_t*)&pmix_pname, &kvn);
            OBJ_DESTRUCT(&kvn);
        }
    }

    return OPAL_SUCCESS;
}
static int s1_fence(opal_list_t *procs, int collect_data) { int rc; int32_t i; opal_value_t *kp, kvn; opal_hwloc_locality_t locality; opal_process_name_t s1_pname; opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1 called fence", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* use the PMI barrier function */ if (PMI_SUCCESS != (rc = PMI_Barrier())) { OPAL_PMI_ERROR(rc, "PMI_Barrier"); return OPAL_ERROR; } opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1 barrier complete", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* get the modex data from each local process and set the * localities to avoid having the MPI layer fetch data * for every process in the job */ s1_pname.jobid = OPAL_PROC_MY_NAME.jobid; if (!got_modex_data) { got_modex_data = true; /* we only need to set locality for each local rank as "not found" * equates to "non-local" */ for (i=0; i < nlranks; i++) { s1_pname.vpid = lranks[i]; rc = opal_pmix_base_cache_keys_locally(&s1_pname, OPAL_PMIX_CPUSET, &kp, pmix_kvs_name, pmix_vallen_max, kvs_get); if (OPAL_SUCCESS != rc) { OPAL_ERROR_LOG(rc); return rc; } if (NULL == kp || NULL == kp->data.string) { /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, opal_process_info.cpuset, kp->data.string); } if (NULL != kp) { OBJ_RELEASE(kp); } OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "%s pmix:s1 proc %s locality %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_NAME_PRINT(s1_pname), opal_hwloc_base_print_locality(locality))); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_PMIX_LOCALITY); kvn.type = OPAL_UINT16; kvn.data.uint16 = locality; opal_pmix_base_store(&s1_pname, &kvn); OBJ_DESTRUCT(&kvn); } } return OPAL_SUCCESS; }
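/*
 * The locality value stored by the fence routines above is a bitmask: each bit
 * records one level of the hardware hierarchy (cluster, CU, node, socket, ...)
 * that two processes share.  The standalone sketch below uses its own illustrative
 * flag values (not OPAL's actual constants) to show how such a mask is composed
 * when only node-level information is known versus when hwloc reports finer
 * sharing, and how a consumer tests it.
 */
#include <stdio.h>
#include <stdint.h>

enum {
    LOC_ON_CLUSTER = 1u << 0,
    LOC_ON_CU      = 1u << 1,
    LOC_ON_NODE    = 1u << 2,
    LOC_ON_SOCKET  = 1u << 3,
    LOC_ON_L2CACHE = 1u << 4,
};

int main(void)
{
    /* all we know is that the peer is on our node */
    uint16_t coarse = LOC_ON_CLUSTER | LOC_ON_CU | LOC_ON_NODE;

    /* hwloc told us we also share a socket and an L2 cache */
    uint16_t fine = coarse | LOC_ON_SOCKET | LOC_ON_L2CACHE;

    printf("coarse shares node: %s\n", (coarse & LOC_ON_NODE) ? "yes" : "no");
    printf("coarse shares socket: %s\n", (coarse & LOC_ON_SOCKET) ? "yes" : "no");
    printf("fine shares socket: %s\n", (fine & LOC_ON_SOCKET) ? "yes" : "no");
    return 0;
}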
static int cray_fence(opal_list_t *procs, int collect_data)
{
    int rc, cnt;
    int32_t i;
    int *all_lens = NULL;
    opal_value_t *kp, kvn;
    opal_buffer_t *send_buffer = NULL;
    opal_buffer_t *buf = NULL;
    void *sbuf_ptr;
    char *cptr, *rcv_buff = NULL;
    opal_process_name_t id;
    typedef struct {
        uint32_t pmix_rank;
        opal_process_name_t name;
        int32_t nbytes;
    } bytes_and_rank_t;
    int32_t rcv_nbytes_tot;
    bytes_and_rank_t s_bytes_and_rank;
    bytes_and_rank_t *r_bytes_and_ranks = NULL;
    opal_hwloc_locality_t locality;
    opal_list_t vals;
    char *cpuset = NULL;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray executing fence cache_global %p cache_local %p",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        (void *)mca_pmix_cray_component.cache_global,
                        (void *)mca_pmix_cray_component.cache_local);

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */

    /*
     * "unload" the cache_local/cache_global buffers, first copy
     * it so we can continue to use the local buffers if further
     * calls to put can be made
     */
    send_buffer = OBJ_NEW(opal_buffer_t);
    if (NULL == send_buffer) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    opal_dss.copy_payload(send_buffer, mca_pmix_cray_component.cache_global);
    opal_dss.unload(send_buffer, &sbuf_ptr, &s_bytes_and_rank.nbytes);
    s_bytes_and_rank.pmix_rank = pmix_rank;
    s_bytes_and_rank.name = OPAL_PROC_MY_NAME;

    r_bytes_and_ranks = (bytes_and_rank_t *)malloc(pmix_size * sizeof(bytes_and_rank_t));
    if (NULL == r_bytes_and_ranks) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    /*
     * gather up all the buffer sizes and rank order.
     * doing this step below since the cray pmi PMI_Allgather doesn't deliver
     * the gathered data necessarily in PMI rank order, although the order stays
     * the same for the duration of a job - assuming no node failures.
     */
    if (PMI_SUCCESS != (rc = PMI_Allgather(&s_bytes_and_rank, r_bytes_and_ranks,
                                           sizeof(bytes_and_rank_t)))) {
        OPAL_PMI_ERROR(rc, "PMI_Allgather");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    for (rcv_nbytes_tot=0, i=0; i < pmix_size; i++) {
        rcv_nbytes_tot += r_bytes_and_ranks[i].nbytes;
    }

    opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray total number of bytes to receive %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), rcv_nbytes_tot);

    rcv_buff = (char *) malloc(rcv_nbytes_tot * sizeof(char));
    if (NULL == rcv_buff) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    all_lens = (int *)malloc(sizeof(int) * pmix_size);
    if (NULL == all_lens) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }
    for (i=0; i < pmix_size; i++) {
        all_lens[r_bytes_and_ranks[i].pmix_rank] = r_bytes_and_ranks[i].nbytes;
    }

    if (PMI_SUCCESS != (rc = PMI_Allgatherv(sbuf_ptr, s_bytes_and_rank.nbytes,
                                            rcv_buff, all_lens))) {
        OPAL_PMI_ERROR(rc, "PMI_Allgatherv");
        rc = OPAL_ERR_COMM_FAILURE;
        goto fn_exit;
    }

    OBJ_RELEASE(send_buffer);
    send_buffer = NULL;

    buf = OBJ_NEW(opal_buffer_t);
    if (buf == NULL) {
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        goto fn_exit;
    }

    for (cptr = rcv_buff, i=0; i < pmix_size; i++) {

        id = r_bytes_and_ranks[i].name;

        buf->base_ptr = NULL;  /* TODO: ugh */
        if (OPAL_SUCCESS != (rc = opal_dss.load(buf, (void *)cptr, r_bytes_and_ranks[i].nbytes))) {
            OPAL_PMI_ERROR(rc, "pmix:cray opal_dss.load failed");
            goto fn_exit;
        }

        /* unpack and stuff in to the dstore */
        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(20, opal_pmix_base_framework.framework_output,
                                "%s pmix:cray unpacked kp with key %s type(%d) for id %s",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key, kp->type,
                                OPAL_NAME_PRINT(id));
            if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&id, kp))) {
                OPAL_ERROR_LOG(rc);
                goto fn_exit;
            }
            OBJ_RELEASE(kp);
            cnt = 1;
        }

        cptr += r_bytes_and_ranks[i].nbytes;
    }

    buf->base_ptr = NULL;  /* TODO: ugh */
    OBJ_RELEASE(buf);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:cray kvs_fence complete",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_pmix_base_fetch(&pmix_pname,
                                                   OPAL_PMIX_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        cpuset = strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non-local" */
    for (i=0; i < pmix_nlranks; i++) {
        id.vpid = pmix_lranks[i];
        id.jobid = pmix_jobid;
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                            "%s checking if %s is local to me",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                            OPAL_NAME_PRINT(id));
        /* fetch cpuset for this vpid */
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_pmix_base_fetch(&id,
                                                       OPAL_PMIX_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:cray proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_PMIX_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        /* store the locality against the peer we just examined */
        opal_pmix_base_store(&id, &kvn);
        OBJ_DESTRUCT(&kvn);
    }

    /* the fence itself succeeded - don't let a "not found" result from
     * the locality pass leak out as the return code */
    rc = OPAL_SUCCESS;

fn_exit:
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    if (all_lens != NULL) {
        free(all_lens);
    }
    if (rcv_buff != NULL) {
        free(rcv_buff);
    }
    if (r_bytes_and_ranks != NULL) {
        free(r_bytes_and_ranks);
    }
    return rc;
}
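/*
 * A standalone sketch (plain C, no PMI) of the exchange pattern used in
 * cray_fence() above: every rank first shares a small fixed-size record (its rank
 * and payload length), the variable-length payloads are then concatenated in the
 * same arrival order, and the receiver walks the concatenated buffer using the
 * advertised lengths.  Arrival order need not be rank order, which is why a
 * length table indexed by the advertised rank is built first.  All data below is
 * fabricated for illustration.
 */
#include <stdio.h>

typedef struct {
    int rank;
    int nbytes;
} size_record_t;

int main(void)
{
    /* pretend these records arrived from an allgather, not in rank order */
    size_record_t recs[3] = { {2, 3}, {0, 5}, {1, 4} };
    int lens_by_rank[3];
    long offset = 0;
    int i;

    for (i = 0; i < 3; i++) {
        lens_by_rank[recs[i].rank] = recs[i].nbytes;
    }
    /* walk the concatenated receive buffer in arrival order, attributing
     * each slice to the rank that advertised it */
    for (i = 0; i < 3; i++) {
        printf("rank %d: %d bytes at offset %ld (length table says %d)\n",
               recs[i].rank, recs[i].nbytes, offset, lens_by_rank[recs[i].rank]);
        offset += recs[i].nbytes;
    }
    return 0;
}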
/* * Function for selecting one component from all those that are * available. */ void orte_rmaps_base_map_job(int fd, short args, void *cbdata) { orte_job_t *jdata; orte_job_map_t *map; int rc; bool did_map; opal_list_item_t *item; orte_rmaps_base_selected_module_t *mod; orte_job_t *parent; orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; /* convenience */ jdata = caddy->jdata; jdata->state = ORTE_JOB_STATE_MAP; /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: mapping job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A * NULL MAP FIELD * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY * DIDN'T SET IT */ if (NULL == jdata->map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: creating new map for job %s", ORTE_JOBID_PRINT(jdata->jobid)); /* create a map object where we will store the results */ map = OBJ_NEW(orte_job_map_t); if (NULL == map) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* load it with the system defaults */ map->mapping = orte_rmaps_base.mapping; map->ranking = orte_rmaps_base.ranking; #if OPAL_HAVE_HWLOC map->binding = opal_hwloc_binding_policy; #endif if (NULL != orte_rmaps_base.ppr) { map->ppr = strdup(orte_rmaps_base.ppr); } map->cpus_per_rank = orte_rmaps_base.cpus_per_rank; map->display_map = orte_rmaps_base.display_map; /* assign the map object to this job */ jdata->map = map; } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: setting mapping policies for job %s", ORTE_JOBID_PRINT(jdata->jobid)); if (!jdata->map->display_map) { jdata->map->display_map = orte_rmaps_base.display_map; } /* set the default mapping policy IFF it wasn't provided */ if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) { ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping); } if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)); } /* ditto for rank and bind policies */ if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) { ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking); } #if OPAL_HAVE_HWLOC if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { jdata->map->binding = opal_hwloc_binding_policy; } #endif } #if OPAL_HAVE_HWLOC /* if we are not going to launch, then we need to set any * undefined topologies to match our own so the mapper * can operate */ if (orte_do_not_launch) { orte_node_t *node; hwloc_topology_t t0; int i; node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); t0 = node->topology; for (i=1; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (NULL == node->topology) { node->topology = t0; } } } #endif /* cycle thru the available mappers until one agrees to map * the job */ did_map = false; if (1 == opal_list_get_size(&orte_rmaps_base.selected_modules)) { /* forced selection */ mod = 
(orte_rmaps_base_selected_module_t*)opal_list_get_first(&orte_rmaps_base.selected_modules); jdata->map->req_mapper = strdup(mod->component->mca_component_name); } for (item = opal_list_get_first(&orte_rmaps_base.selected_modules); item != opal_list_get_end(&orte_rmaps_base.selected_modules); item = opal_list_get_next(item)) { mod = (orte_rmaps_base_selected_module_t*)item; if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata)) || ORTE_ERR_RESOURCE_BUSY == rc) { did_map = true; break; } /* mappers return "next option" if they didn't attempt to * map the job. anything else is a true error. */ if (ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } } if (did_map && ORTE_ERR_RESOURCE_BUSY == rc) { /* the map was done but nothing could be mapped * for launch as all the resources were busy */ OBJ_RELEASE(caddy); return; } /* if we get here without doing the map, or with zero procs in * the map, then that's an error */ if (!did_map || 0 == jdata->num_procs) { orte_show_help("help-orte-rmaps-base.txt", "failed-map", true); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } /* compute and save local ranks */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #if OPAL_HAVE_HWLOC /* compute and save bindings */ if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) { ORTE_ERROR_LOG(rc); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } #endif /* set the offset so shared memory components can potentially * connect to any spawned jobs */ jdata->offset = orte_total_procs; /* track the total number of procs launched by us */ orte_total_procs += jdata->num_procs; /* if it is a dynamic spawn, save the bookmark on the parent's job too */ if (ORTE_JOBID_INVALID != jdata->originator.jobid) { if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) { parent->bookmark = jdata->bookmark; } } /* if we wanted to display the map, now is the time to do it - ignore * daemon job */ if (jdata->map->display_map) { char *output=NULL; int i, j; orte_node_t *node; orte_proc_t *proc; if (orte_display_diffable_output) { /* intended solely to test mapping methods, this output * can become quite long when testing at scale. Rather * than enduring all the malloc/free's required to * create an arbitrary-length string, custom-generate * the output a line at a time here */ /* display just the procs in a diffable format */ opal_output(orte_clean_output, "<map>\n\t<jobid=%s>\n\t<offset=%s>", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); fflush(stderr); /* loop through nodes */ for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? 
"UNKNOWN" : node->name); fflush(stderr); for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } #if OPAL_HAVE_HWLOC { char locale[64]; if (NULL != proc->locale) { hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset); } opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank, locale, (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap); } #else opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>", ORTE_VPID_PRINT(proc->name.vpid), (long)proc->app_idx, (unsigned long)proc->local_rank, (unsigned long)proc->node_rank); #endif fflush(stderr); } opal_output(orte_clean_output, "\t</host>"); fflush(stderr); } #if OPAL_HAVE_HWLOC { opal_hwloc_locality_t locality; orte_proc_t *p0; /* test locality - for the first node, print the locality of each proc relative to the first one */ node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0); p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0); opal_output(orte_clean_output, "\t<locality>"); for (j=1; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } locality = opal_hwloc_base_get_relative_locality(node->topology, p0->cpu_bitmap, proc->cpu_bitmap); opal_output(orte_clean_output, "\t\t<rank=%s rank=%s locality=%s>", ORTE_VPID_PRINT(p0->name.vpid), ORTE_VPID_PRINT(proc->name.vpid), opal_hwloc_base_print_locality(locality)); } opal_output(orte_clean_output, "\t</locality>\n</map>"); fflush(stderr); } #else opal_output(orte_clean_output, "\n</map>"); fflush(stderr); #endif } else { opal_output(orte_clean_output, " Data for JOB %s offset %s", ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(jdata->offset)); opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP); if (orte_xml_output) { fprintf(orte_xml_fp, "%s\n", output); fflush(orte_xml_fp); } else { opal_output(orte_clean_output, "%s", output); } free(output); } } /* set the job state to the next position */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE); /* cleanup */ OBJ_RELEASE(caddy); }
static bool native_get_attr(const char *attr, opal_value_t **kv) { opal_buffer_t *msg, *bptr; opal_list_t vals; opal_value_t *kp, *lclpeers=NULL, kvn; pmix_cmd_t cmd = PMIX_GETATTR_CMD; char **ranks; int rc, ret; int32_t cnt; bool found=false; opal_hwloc_locality_t locality; pmix_cb_t *cb; uint32_t i, myrank; opal_process_name_t id; char *cpuset; opal_buffer_t buf, buf2; opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:native get_attr called", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* try to retrieve the requested value from the dstore */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS == opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, attr, &vals)) { *kv = (opal_value_t*)opal_list_remove_first(&vals); OPAL_LIST_DESTRUCT(&vals); return true; } if (NULL == mca_pmix_native_component.uri) { /* no server available, so just return */ return false; } /* if the value isn't yet available, then we should try to retrieve * all the available attributes and store them for future use */ msg = OBJ_NEW(opal_buffer_t); /* pack the cmd */ if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &cmd, 1, PMIX_CMD_T))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(msg); return false; } /* create a callback object as we need to pass it to the * recv routine so we know which callback to use when * the return message is recvd */ cb = OBJ_NEW(pmix_cb_t); cb->active = true; /* push the message into our event base to send to the server */ PMIX_ACTIVATE_SEND_RECV(msg, wait_cbfunc, cb); /* wait for the data to return */ PMIX_WAIT_FOR_COMPLETION(cb->active); /* we have received the entire data blob for this process - unpack * and cache all values, keeping the one we requested to return * to the caller */ cnt = 1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &ret, &cnt, OPAL_INT))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(cb); return false; } if (OPAL_SUCCESS == ret) { /* unpack the buffer containing the values */ cnt = 1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(cb); return false; } cnt = 1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s unpacked attr %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key); /* if this is the local topology, we need to save it in a special way */ #if OPAL_HAVE_HWLOC { hwloc_topology_t topo; if (0 == strcmp(PMIX_LOCAL_TOPO, kp->key)) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s saving topology", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* transfer the byte object for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size); kp->data.bo.bytes = NULL; // protect the data region kp->data.bo.size = 0; OBJ_RELEASE(kp); /* extract the topology */ cnt=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &topo, &cnt, OPAL_HWLOC_TOPO))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); continue; } OBJ_DESTRUCT(&buf); if (NULL == opal_hwloc_topology) { opal_hwloc_topology = topo; } else { hwloc_topology_destroy(topo); } cnt = 1; continue; } } #endif /* if this is the local cpuset blob, then unpack and store its contents */ if (0 == strcmp(PMIX_LOCAL_CPUSETS, kp->key)) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s received local cpusets", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* transfer the byte object for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size); 
kp->data.bo.bytes = NULL; // protect the data region kp->data.bo.size = 0; OBJ_RELEASE(kp); cnt=1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &id, &cnt, OPAL_NAME))) { cnt=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &cpuset, &cnt, OPAL_STRING))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); cnt = 1; continue; } opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s saving cpuset %s for local peer %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), (NULL == cpuset) ? "NULL" : cpuset, OPAL_NAME_PRINT(id)); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_CPUSET); kvn.type = OPAL_STRING; kvn.data.string = cpuset; if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, &kvn))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kvn); cnt = 1; continue; } OBJ_DESTRUCT(&kvn); } OBJ_DESTRUCT(&buf); if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { OPAL_ERROR_LOG(rc); return false; } cnt=1; continue; } else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s received proc map", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* transfer the byte object for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size); kp->data.bo.bytes = NULL; // protect the data region kp->data.bo.size = 0; OBJ_RELEASE(kp); /* get the jobid */ cnt=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); cnt = 1; return false; } if (0 != strcmp(PMIX_JOBID, kp->key)) { OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); OBJ_DESTRUCT(&buf); OBJ_RELEASE(kp); cnt = 1; return false; } id.jobid = kp->data.uint32; OBJ_RELEASE(kp); /* unpack the data for each rank */ cnt=1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { if (0 != strcmp(PMIX_RANK, kp->key)) { OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); OBJ_DESTRUCT(&buf); OBJ_RELEASE(kp); cnt = 1; return false; } id.vpid = kp->data.uint32; /* unpack the blob for this rank */ cnt=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&buf); cnt = 1; return false; } if (0 != strcmp(PMIX_PROC_MAP, kp->key)) { OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM); OBJ_DESTRUCT(&buf); OBJ_RELEASE(kp); cnt = 1; return false; } /* transfer the byte object for unpacking */ OBJ_CONSTRUCT(&buf2, opal_buffer_t); opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size); kp->data.bo.bytes = NULL; // protect the data region kp->data.bo.size = 0; OBJ_RELEASE(kp); /* unpack and store the map */ cnt=1; while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s storing key %s for peer %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key, OPAL_NAME_PRINT(id)); if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(kp); OBJ_DESTRUCT(&buf2); return false; } } OBJ_DESTRUCT(&buf2); if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { OPAL_ERROR_LOG(rc); return false; } cnt=1; } OBJ_DESTRUCT(&buf); if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { OPAL_ERROR_LOG(rc); return false; } cnt=1; continue; } /* otherwise, it is a single piece of info, so store it */ if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) { OPAL_ERROR_LOG(rc); OBJ_RELEASE(kp); cnt = 1; continue; } /* save the list of local peers */ if (0 == strcmp(PMIX_LOCAL_PEERS, kp->key)) { OBJ_RETAIN(kp); lclpeers = kp; 
opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s saving local peers %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lclpeers->data.string); } else if (0 == strcmp(PMIX_JOBID, kp->key)) { native_pname.jobid = kp->data.uint32; } else if (0 == strcmp(PMIX_RANK, kp->key)) { native_pname.vpid = kp->data.uint32; } if (0 == strcmp(attr, kp->key)) { OBJ_RETAIN(kp); *kv = kp; found = true; } OBJ_RELEASE(kp); cnt = 1; } OBJ_RELEASE(bptr); if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { OPAL_ERROR_LOG(rc); return false; } } else { OPAL_ERROR_LOG(ret); OBJ_RELEASE(cb); return false; } OBJ_RELEASE(cb); opal_proc_set_name(&native_pname); /* if the list of local peers wasn't included, then we are done */ if (NULL == lclpeers) { opal_output_verbose(0, opal_pmix_base_framework.framework_output, "%s no local peers reported", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); return found; } /* baseline all the procs as nonlocal */ myrank = native_pname.vpid; id.jobid = native_pname.jobid; #if OPAL_HAVE_HWLOC /* fetch my cpuset */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS == (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname, OPAL_DSTORE_CPUSET, &vals))) { kp = (opal_value_t*)opal_list_get_first(&vals); cpuset = strdup(kp->data.string); } else { cpuset = NULL; } OPAL_LIST_DESTRUCT(&vals); #endif /* we only need to set locality for each local rank as "not found" * equates to "non local" */ ranks = opal_argv_split(lclpeers->data.string, ','); for (i=0; NULL != ranks[i]; i++) { uint32_t vid = strtoul(ranks[i], NULL, 10); if (myrank == vid) { continue; } id.vpid = vid; #if OPAL_HAVE_HWLOC OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id, OPAL_DSTORE_CPUSET, &vals))) { opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s cpuset for local proc %s not found", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_NAME_PRINT(id)); OPAL_LIST_DESTRUCT(&vals); /* even though the cpuset wasn't found, we at least know it is * on the same node with us */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { kp = (opal_value_t*)opal_list_get_first(&vals); if (NULL == kp->data.string) { /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, cpuset, kp->data.string); } OPAL_LIST_DESTRUCT(&vals); } #else /* all we know is we share a node */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; #endif OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "%s pmix:native proc %s locality %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_NAME_PRINT(id), opal_hwloc_base_print_locality(locality))); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_LOCALITY); kvn.type = OPAL_UINT16; kvn.data.uint16 = locality; (void)opal_dstore.store(opal_dstore_internal, &id, &kvn); OBJ_DESTRUCT(&kvn); } #if OPAL_HAVE_HWLOC if (NULL != cpuset) { free(cpuset); } #endif opal_argv_free(ranks); return found; }
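/*
 * The local-peers attribute handled above arrives as a single comma-separated
 * string of vpids.  This standalone sketch (plain C, no OPAL) shows the
 * equivalent of the opal_argv_split()/strtoul() loop: walking such a string,
 * converting each field, and skipping our own rank.  The peer list and rank are
 * fabricated.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const char *peers = "0,1,3,7";
    unsigned long myrank = 3;
    const char *p = peers;
    char *end;

    while (*p) {
        unsigned long vpid = strtoul(p, &end, 10);
        if (vpid != myrank) {
            printf("peer vpid %lu shares my node\n", vpid);
        }
        if (*end != ',') {
            break;              /* no more fields */
        }
        p = end + 1;
    }
    return 0;
}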
/* only APPS call this function - daemons have their own */ int orte_util_decode_pidmap(opal_byte_object_t *bo) { orte_vpid_t i, num_procs, *vptr, daemon; orte_vpid_t *daemons=NULL; orte_local_rank_t *local_rank=NULL; orte_node_rank_t *node_rank=NULL; #if OPAL_HAVE_HWLOC opal_hwloc_level_t bind_level = OPAL_HWLOC_NODE_LEVEL, pbind, *lvptr; unsigned int *bind_idx=NULL, pbidx, *uiptr; #endif opal_hwloc_locality_t locality; orte_std_cntr_t n; opal_buffer_t buf; int rc; orte_proc_state_t *states = NULL; orte_app_idx_t *app_idx = NULL; int32_t *restarts = NULL; orte_process_name_t proc, dmn; orte_namelist_t *nm; opal_list_t jobs; char *hostname; /* xfer the byte object to a buffer for unpacking */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.load(&buf, bo->bytes, bo->size))) { ORTE_ERROR_LOG(rc); goto cleanup; } n = 1; /* cycle through the buffer */ OBJ_CONSTRUCT(&jobs, opal_list_t); while (ORTE_SUCCESS == (rc = opal_dss.unpack(&buf, &proc.jobid, &n, ORTE_JOBID))) { OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:decode:pidmap working job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(proc.jobid))); /* record the jobid */ nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = proc.jobid; opal_list_append(&jobs, &nm->super); /* unpack and store the number of procs */ n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &num_procs, &n, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } proc.vpid = ORTE_VPID_INVALID; if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NPROCS, &num_procs, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } #if OPAL_HAVE_HWLOC /* unpack and store the binding level */ n=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &bind_level, &n, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* store it */ proc.vpid = ORTE_VPID_INVALID; if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_LEVEL, &bind_level, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* set mine */ if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { orte_process_info.bind_level = bind_level; } OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:decode:pidmap nprocs %s bind level %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(num_procs), opal_hwloc_base_print_level(bind_level))); #endif /* allocate memory for the daemon info */ daemons = (orte_vpid_t*)malloc(num_procs * sizeof(orte_vpid_t)); /* unpack it in one shot */ n=num_procs; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, daemons, &n, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* allocate memory for local ranks */ local_rank = (orte_local_rank_t*)malloc(num_procs*sizeof(orte_local_rank_t)); /* unpack them in one shot */ n=num_procs; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, local_rank, &n, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { /* set mine */ orte_process_info.my_local_rank = local_rank[ORTE_PROC_MY_NAME->vpid]; if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_LOCALRANK, &orte_process_info.my_local_rank, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } } /* allocate memory for node ranks */ node_rank = (orte_node_rank_t*)malloc(num_procs*sizeof(orte_node_rank_t)); /* unpack node ranks in one shot */ n=num_procs; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, node_rank, &n, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { /* set mine */ orte_process_info.my_node_rank = node_rank[ORTE_PROC_MY_NAME->vpid]; if 
(ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_NODERANK, &orte_process_info.my_node_rank, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } } #if OPAL_HAVE_HWLOC /* allocate memory for bind_idx */ bind_idx = (unsigned int*)malloc(num_procs*sizeof(unsigned int)); /* unpack bind_idx in one shot */ n=num_procs; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, bind_idx, &n, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (proc.jobid == ORTE_PROC_MY_NAME->jobid) { /* set mine */ orte_process_info.bind_idx = bind_idx[ORTE_PROC_MY_NAME->vpid]; if (ORTE_SUCCESS != (rc = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_BIND_INDEX, &orte_process_info.bind_idx, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } } #endif /* allocate memory for states */ states = (orte_proc_state_t*)malloc(num_procs*sizeof(orte_proc_state_t)); /* unpack states in one shot */ n=num_procs; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, states, &n, ORTE_PROC_STATE))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* dump this info - apps don't need it */ free(states); states = NULL; /* allocate memory for app_idx's */ app_idx = (orte_app_idx_t*)malloc(num_procs*sizeof(orte_app_idx_t)); /* unpack app_idx's in one shot */ n=num_procs; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, app_idx, &n, ORTE_APP_IDX))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* dump this info - apps don't need it */ free(app_idx); app_idx = NULL; /* allocate memory for restarts */ restarts = (int32_t*)malloc(num_procs*sizeof(int32_t)); /* unpack restarts in one shot */ n=num_procs; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, restarts, &n, OPAL_INT32))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* dump this info - apps don't need it */ free(restarts); restarts = NULL; /* set the daemon jobid */ dmn.jobid = ORTE_DAEMON_JOBID(ORTE_PROC_MY_NAME->jobid); /* xfer the data */ for (i=0; i < num_procs; i++) { if (proc.jobid == ORTE_PROC_MY_NAME->jobid && i == ORTE_PROC_MY_NAME->vpid) { continue; } proc.vpid = i; if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_DAEMON_VPID, &daemons[i], ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* lookup and store the hostname for this proc */ dmn.vpid = daemons[i]; if (ORTE_SUCCESS != (rc = orte_db.fetch_pointer(&dmn, ORTE_DB_HOSTNAME, (void**)&hostname, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank[i], ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank[i], ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } #if OPAL_HAVE_HWLOC if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_BIND_INDEX, &bind_idx[i], OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((10, orte_nidmap_output, "%s orte:util:decode:pidmap proc %s host %s lrank %d nrank %d bindidx %u", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc), hostname, (int)local_rank[i], (int)node_rank[i], bind_idx[i])); #endif } /* release data */ free(daemons); daemons = NULL; free(local_rank); local_rank = NULL; free(node_rank); node_rank = NULL; #if OPAL_HAVE_HWLOC free(bind_idx); bind_idx = NULL; #endif /* setup for next cycle */ n = 1; } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } rc = ORTE_SUCCESS; /* now that we have all the data, we are guaranteed * to know our own node, so go 
back and record the * locality of each proc relative to me */ while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&jobs))) { proc.jobid = nm->name.jobid; /* recover the number of procs in this job */ vptr = &num_procs; proc.vpid = ORTE_VPID_INVALID; if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_NPROCS, (void**)&vptr, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } for (i=0; i < num_procs; i++) { if (ORTE_PROC_MY_NAME->vpid == i && ORTE_PROC_MY_NAME->jobid == proc.jobid) { /* this is me */ continue; } proc.vpid = i; /* recover the daemon for this proc */ vptr = &daemon; if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_DAEMON_VPID, (void**)&vptr, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (daemon == ORTE_PROC_MY_DAEMON->vpid) { OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:decode:pidmap proc %s shares node", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc))); /* we share a node, so add them to the count of peers * sharing the node with me */ orte_process_info.num_local_peers++; #if OPAL_HAVE_HWLOC /* retrieve the bind level for the other proc's job */ lvptr = &pbind; proc.vpid = ORTE_VPID_INVALID; if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_LEVEL, (void**)&lvptr, OPAL_HWLOC_LEVEL_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* retrieve the other's proc's bind idx */ uiptr = &pbidx; proc.vpid = i; if (ORTE_SUCCESS != (rc = orte_db.fetch(&proc, ORTE_DB_BIND_INDEX, (void**)&uiptr, OPAL_UINT))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* we share a node - see what else we share */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, orte_process_info.bind_level, orte_process_info.bind_idx, pbind, pbidx); #else locality = OPAL_PROC_ON_NODE; #endif } else { /* we don't share a node */ OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:decode:pidmap proc %s does NOT node [my daemon %s, their daemon %s]", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc), ORTE_VPID_PRINT(ORTE_PROC_MY_DAEMON->vpid), ORTE_VPID_PRINT(daemon))); locality = OPAL_PROC_NON_LOCAL; } /* store the locality */ OPAL_OUTPUT_VERBOSE((2, orte_nidmap_output, "%s orte:util:decode:pidmap set proc %s locality to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc), opal_hwloc_base_print_locality(locality))); if (ORTE_SUCCESS != (rc = orte_db.store(&proc, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } cleanup: if (NULL != daemons) { free(daemons); } if (NULL != local_rank) { free(local_rank); } if (NULL != node_rank) { free(node_rank); } #if OPAL_HAVE_HWLOC if (NULL != bind_idx) { free(bind_idx); } #endif if (NULL != states) { free(states); } if (NULL != app_idx) { free(app_idx); } if (NULL != restarts) { free(restarts); } OBJ_DESTRUCT(&buf); return rc; }
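/*
 * orte_util_decode_pidmap() above repeatedly unpacks typed entries from a byte
 * object until the DSS reports "read past end of buffer", which is the normal
 * termination condition rather than an error.  This standalone sketch mimics that
 * idiom with a trivial cursor-based reader over length-prefixed records; the
 * record format and helper name are invented purely for illustration.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define READ_PAST_END 1

/* read one length-prefixed record; return 0 on success, READ_PAST_END when done */
static int unpack_record(const uint8_t *buf, size_t len, size_t *cursor,
                         char *out, size_t outmax)
{
    uint8_t rlen;

    if (*cursor >= len) {
        return READ_PAST_END;
    }
    rlen = buf[*cursor];
    if (*cursor + 1 + rlen > len || (size_t)rlen + 1 > outmax) {
        return READ_PAST_END;
    }
    memcpy(out, buf + *cursor + 1, rlen);
    out[rlen] = '\0';
    *cursor += 1 + rlen;
    return 0;
}

int main(void)
{
    /* two records: "node0" and "node1", each prefixed by its length */
    const uint8_t blob[] = { 5, 'n','o','d','e','0', 5, 'n','o','d','e','1' };
    size_t cursor = 0;
    char rec[16];
    int rc;

    while (0 == (rc = unpack_record(blob, sizeof(blob), &cursor, rec, sizeof(rec)))) {
        printf("unpacked: %s\n", rec);
    }
    /* rc == READ_PAST_END here is the expected, non-error way to finish */
    return 0;
}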
void orte_grpcomm_base_store_peer_modex(opal_buffer_t *rbuf, void *cbdata) { int rc, cnt; orte_process_name_t pname; char *hostname; orte_vpid_t daemon; orte_node_rank_t node_rank; orte_local_rank_t local_rank; orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; opal_hwloc_locality_t locality; OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s STORING PEER MODEX DATA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* unpack the process name */ cnt=1; while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &pname, &cnt, ORTE_NAME))) { /* unpack and store the hostname */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &hostname, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, hostname, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack and store the daemon vpid */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &daemon, &cnt, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_DAEMON_VPID, &daemon, OPAL_UINT32))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack and store the node rank */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* unpack the local rank */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* compute the locality and store in the database */ #if OPAL_HAVE_HWLOC { char *cpuset; /* unpack and store the cpuset - could be NULL */ cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &cpuset, &cnt, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_CPUSET, cpuset, OPAL_STRING))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex setting proc %s cpuset %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname), cpuset)); if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) { /* if this data is from myself, then set locality to all */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex setting proc %s locale ALL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); locality = OPAL_PROC_ALL_LOCAL; } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex setting proc %s locale NONLOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); locality = OPAL_PROC_NON_LOCAL; } else if (NULL == cpuset || NULL == orte_process_info.cpuset) { /* one or both of us is not bound, so all we can say is we are on the * same node */ locality = OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, orte_process_info.cpuset, 
cpuset); OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex setting proc %s locale %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(locality))); } } #else if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &pname, ORTE_PROC_MY_NAME)) { /* if this data is from myself, then set locality to all */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s grpcomm:base:modex setting proc %s locale ALL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); locality = OPAL_PROC_ALL_LOCAL; } else if (daemon != ORTE_PROC_MY_DAEMON->vpid) { /* this is on a different node, then mark as non-local */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex setting proc %s locale NONLOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); locality = OPAL_PROC_NON_LOCAL; } else { /* must be on our node */ locality = OPAL_PROC_ON_NODE; } #endif if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&pname, OPAL_DB_INTERNAL, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex: adding modex entry for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); /* update the modex database */ if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&pname, rbuf))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex: completed modex entry for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); } cleanup: /* flag the collective as complete */ modex->active = false; /* cleanup the list, but don't release the * collective object as it was passed into us */ opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); /* notify that the modex is complete */ if (NULL != modex->cbfunc) { OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s CALLING MODEX RELEASE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); modex->cbfunc(NULL, modex->cbdata); } else { OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex NO MODEX RELEASE CBFUNC", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } }
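/*
 * The locality decision in orte_grpcomm_base_store_peer_modex() above follows a
 * fixed ladder: my own entry is "all local", a peer under a different daemon is
 * "non-local", a node-sharing peer without a cpuset is only known to be "on
 * node", and otherwise the two cpuset strings are compared.  This standalone
 * sketch captures that ladder with simplified types; compare_cpusets() merely
 * stands in for opal_hwloc_base_get_relative_locality() and is purely
 * illustrative.
 */
#include <stdio.h>
#include <string.h>

enum locality { LOCALITY_ALL, LOCALITY_NONE, LOCALITY_NODE_ONLY, LOCALITY_FINE };

static enum locality compare_cpusets(const char *mine, const char *theirs)
{
    /* stand-in: an identical binding string means "fine-grained sharing" here */
    return (0 == strcmp(mine, theirs)) ? LOCALITY_FINE : LOCALITY_NODE_ONLY;
}

static enum locality classify(int is_me, int same_daemon,
                              const char *my_cpuset, const char *peer_cpuset)
{
    if (is_me) {
        return LOCALITY_ALL;                 /* my own modex entry */
    }
    if (!same_daemon) {
        return LOCALITY_NONE;                /* hosted by a different daemon/node */
    }
    if (NULL == my_cpuset || NULL == peer_cpuset) {
        return LOCALITY_NODE_ONLY;           /* unbound: only node sharing is known */
    }
    return compare_cpusets(my_cpuset, peer_cpuset);
}

int main(void)
{
    printf("%d %d %d %d\n",
           classify(1, 1, "0-3", "0-3"),
           classify(0, 0, "0-3", "4-7"),
           classify(0, 1, NULL, "4-7"),
           classify(0, 1, "0-3", "0-3"));
    return 0;
}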
static void fencenb(int sd, short args, void *cbdata)
{
    pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
    int rc = OPAL_SUCCESS;
    int32_t i;
    opal_value_t *kp, kvn;
    opal_hwloc_locality_t locality;
    opal_process_name_t pname;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:s2 called fence",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* check if there is a partially filled meta key and, if so, put it */
    opal_pmix_base_commit_packed(&pmix_packed_data, &pmix_packed_data_offset,
                                 &pmix_packed_encoded_data, &pmix_packed_encoded_data_offset,
                                 pmix_vallen_max, &pmix_pack_key, kvs_put);

    /* now call fence */
    if (PMI2_SUCCESS != PMI2_KVS_Fence()) {
        rc = OPAL_ERROR;
        goto cleanup;
    }

    /* get the modex data from each local process and set the
     * localities to avoid having the MPI layer fetch data
     * for every process in the job */
    pname.jobid = OPAL_PROC_MY_NAME.jobid;
    if (!got_modex_data) {
        got_modex_data = true;
        /* we only need to set locality for each local rank as "not found"
         * equates to "non-local" */
        for (i=0; i < s2_nlranks; i++) {
            pname.vpid = s2_lranks[i];
            /* fetch the cpuset for the local peer named above */
            rc = opal_pmix_base_cache_keys_locally(&pname, OPAL_PMIX_CPUSET,
                                                   &kp, pmix_kvs_name, pmix_vallen_max, kvs_get);
            if (OPAL_SUCCESS != rc) {
                OPAL_ERROR_LOG(rc);
                goto cleanup;
            }
            if (NULL == kp || NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 opal_process_info.cpuset,
                                                                 kp->data.string);
            }
            if (NULL != kp) {
                OBJ_RELEASE(kp);
            }
            OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                                 "%s pmix:s2 proc %s locality %s",
                                 OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                 OPAL_NAME_PRINT(pname),
                                 opal_hwloc_base_print_locality(locality)));

            OBJ_CONSTRUCT(&kvn, opal_value_t);
            kvn.key = strdup(OPAL_PMIX_LOCALITY);
            kvn.type = OPAL_UINT16;
            kvn.data.uint16 = locality;
            opal_pmix_base_store(&pname, &kvn);
            OBJ_DESTRUCT(&kvn);
        }
    }

cleanup:
    if (NULL != op->opcbfunc) {
        op->opcbfunc(rc, op->cbdata);
    }
    OBJ_RELEASE(op);
    return;
}
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; opal_value_t *kv; char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; char **peers=NULL, *mycpuset; opal_process_name_t wildcard_rank, pname; bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false; size_t i; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); /* we cannot run */ error = "pmix init"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) { /* we cannot run */ error = "pmix init"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) { /* we cannot run - this could be due to being direct launched * without the required PMI support being built. Try to detect * that scenario and warn the user */ if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() && NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) { if (0 == strcmp(envar, "SLURM")) { /* yes to both - so emit a hopefully helpful * error message and abort */ orte_show_help_finalize(); orte_show_help("help-ess-base.txt", "slurm-error", true); return ORTE_ERR_SILENT; } else if (0 == strcmp(envar, "ALPS")) { /* we were direct launched by ALPS */ orte_show_help_finalize(); orte_show_help("help-ess-base.txt", "alps-error", true); return ORTE_ERR_SILENT; } } error = "pmix init"; goto error; } u32ptr = &u32; u16ptr = &u16; /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* pmix.init set our process name down in the OPAL layer, * so carry it forward here */ ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; /* setup a name for retrieving data associated with the job */ wildcard_rank.jobid = ORTE_PROC_MY_NAME->jobid; wildcard_rank.vpid = ORTE_NAME_WILDCARD->vpid; /* setup a name for retrieving proc-specific data */ pname.jobid = ORTE_PROC_MY_NAME->jobid; pname.vpid = 0; /* get our local rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting local rank"; goto error; } orte_process_info.my_local_rank = u16; /* get our node rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting node rank"; goto error; } orte_process_info.my_node_rank = u16; /* get max procs for this application */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting max procs"; goto error; } orte_process_info.max_procs = u32; /* get job size */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting job size"; goto error; } orte_process_info.num_procs = u32; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); 
putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.app_num = u32; } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_local_peers = u32 - 1; // want number besides ourselves } else { orte_process_info.num_local_peers = 0; } /* get number of nodes in the job */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NUM_NODES, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_nodes = u32; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } opal_output_verbose(2, orte_ess_base_framework.framework_output, "%s transport key %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key); asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } /* retrieve temp directories info */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.top_session_dir ){ orte_process_info.top_session_dir = val; } else { /* keep the MCA setting */ tdir_mca_override = true; free(val); } val = NULL; } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.job_session_dir ){ orte_process_info.job_session_dir = val; } else { /* keep the MCA setting */ free(val); tdir_mca_override = true; } val = NULL; } } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.proc_session_dir ){ orte_process_info.proc_session_dir = val; } else { /* keep the MCA setting */ tdir_mca_override = true; free(val); } val = NULL; } } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL); if (OPAL_SUCCESS == ret ) { orte_process_info.rm_session_dirs = bool_val; } } /* get our local peers */ if (0 < orte_process_info.num_local_peers) { /* if my local rank if too high, then that's an error */ if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) { ret = ORTE_ERR_BAD_PARAM; error = "num local 
peers"; goto error; } /* retrieve the local peers */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); } else { peers = NULL; } } else { peers = NULL; } /* set the locality */ if (NULL != peers) { /* identify our location */ val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { mycpuset = val; } else { mycpuset = NULL; } pname.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { pname.vpid = strtoul(peers[i], NULL, 10); if (pname.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; } else { val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, &pname, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { u16 = opal_hwloc_compute_relative_locality(mycpuset, val); free(val); } else { /* all we can say is that it shares our node */ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } } kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALITY); kv->type = OPAL_UINT16; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, "%s ess:pmi:locality: proc %s locality %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16))); kv->data.uint16 = u16; ret = opal_pmix.store_local(&pname, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); if (NULL != mycpuset) { free(mycpuset); } goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); if (NULL != mycpuset) { free(mycpuset); } } /* now that we have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /* push our hostname so others can find us, if they need to - the * native PMIx component will ignore this request as the hostname * is provided by the system */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. 
* Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { /* need to commit the data before we fence */ opal_pmix.commit(); opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (!progress_thread_running) { /* can't send the help message, so ensure it * comes out locally */ orte_show_help_finalize(); } if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
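/*
 * rte_init() above publishes several values by formatting "NAME=value" strings
 * with asprintf() and handing them to putenv().  putenv() does not copy its
 * argument, so the string must stay allocated for the life of the process - which
 * is why the code above deliberately never frees envar/ev1/ev2.  A standalone
 * illustration of that idiom (the variable name is made up):
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char *ev;
    int nprocs = 4;   /* stand-in for orte_process_info.num_procs */

    if (0 > asprintf(&ev, "EXAMPLE_APP_CTX_NUM_PROCS=%d", nprocs)) {
        return 1;
    }
    putenv(ev);                 /* ev is now owned by the environment */
    /* do NOT free(ev) here - getenv() would be left pointing at freed memory */

    printf("EXAMPLE_APP_CTX_NUM_PROCS=%s\n", getenv("EXAMPLE_APP_CTX_NUM_PROCS"));
    return 0;
}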