static int s1_init(void) { PMI_BOOL initialized; int spawned; int rc, ret = OPAL_ERROR; int i, rank, lrank, nrank; char *pmix_id, tmp[64]; opal_value_t kv; char *str; uint32_t ui32; opal_process_name_t ldr; char **localranks=NULL; if (PMI_SUCCESS != (rc = PMI_Initialized(&initialized))) { OPAL_PMI_ERROR(rc, "PMI_Initialized"); return OPAL_ERROR; } if (PMI_TRUE != initialized && PMI_SUCCESS != (rc = PMI_Init(&spawned))) { OPAL_PMI_ERROR(rc, "PMI_Init"); return OPAL_ERROR; } // setup hash table opal_pmix_base_hash_init(); // Initialize space demands rc = PMI_KVS_Get_value_length_max(&pmix_vallen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_value_length_max"); goto err_exit; } pmix_vallen_threshold = pmix_vallen_max * 3; pmix_vallen_threshold >>= 2; rc = PMI_KVS_Get_name_length_max(&pmix_kvslen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max"); goto err_exit; } rc = PMI_KVS_Get_key_length_max(&pmix_keylen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max"); goto err_exit; } // Initialize job environment information pmix_id = (char*)malloc(pmix_vallen_max); if (pmix_id == NULL) { ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } /* Get domain id */ if (PMI_SUCCESS != (rc = PMI_Get_kvs_domain_id(pmix_id, pmix_vallen_max))) { free(pmix_id); goto err_exit; } /* get our rank */ ret = PMI_Get_rank(&rank); if( PMI_SUCCESS != ret ) { OPAL_PMI_ERROR(ret, "PMI_Get_rank"); free(pmix_id); goto err_exit; } /* Slurm PMI provides the job id as an integer followed * by a '.', followed by essentially a stepid. The first integer * defines an overall job number. The second integer is the number of * individual jobs we have run within that allocation. */ s1_pname.jobid = strtoul(pmix_id, &str, 10); s1_pname.jobid = (s1_pname.jobid << 16) & 0xffff0000; if (NULL != str) { ui32 = strtoul(str, NULL, 10); s1_pname.jobid |= (ui32 & 0x0000ffff); } ldr.jobid = s1_pname.jobid; s1_pname.vpid = rank; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ opal_proc_set_name(&s1_pname); opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1: assigned tmp name", OPAL_NAME_PRINT(s1_pname)); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = s1_pname.jobid; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save it */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_RANK); kv.type = OPAL_UINT32; kv.data.uint32 = rank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if (pmix_kvs_name == NULL) { ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI_KVS_Get_my_name(pmix_kvs_name, pmix_kvslen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name"); goto err_exit; } /* get our local proc info to find our local rank */ if (PMI_SUCCESS != (rc = PMI_Get_clique_size(&nlranks))) { OPAL_PMI_ERROR(rc, "PMI_Get_clique_size"); return rc; } /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = nlranks; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); lrank = 0; nrank = 0; ldr.vpid = rank; if (0 < nlranks) { /* now get the specific ranks */ lranks = (int*)calloc(nlranks, sizeof(int)); if (NULL == lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } if (PMI_SUCCESS != (rc = PMI_Get_clique_ranks(lranks, nlranks))) { OPAL_PMI_ERROR(rc, "PMI_Get_clique_ranks"); free(lranks); return rc; } /* note the local ldr */ ldr.vpid = lranks[0]; /* save this */ memset(tmp, 0, 64); for (i=0; i < nlranks; i++) { (void)snprintf(tmp, 64, "%d", lranks[i]); opal_argv_append_nosize(&localranks, tmp); if (rank == lranks[i]) { lrank = i; nrank = i; } } str = opal_argv_join(localranks, ','); opal_argv_free(localranks); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = str; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); } /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = *(uint64_t*)&ldr; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = lrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = nrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get universe size */ ret = PMI_Get_universe_size(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_universe_size"); goto err_exit; } /* push this into the dstore for subsequent fetches */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get job size */ ret = PMI_Get_size(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_size"); goto err_exit; } OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get appnum */ ret = PMI_Get_appnum(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_appnum"); goto err_exit; } OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* increment the init count */ ++pmix_init_count; return OPAL_SUCCESS; err_exit: PMI_Finalize(); return ret; }
static int isolated_init(void) { int rc; opal_value_t kv; ++isolated_init_count; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ isolated_pname.jobid = 1; isolated_pname.vpid = 0; opal_proc_set_name(&isolated_pname); opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:isolated: assigned tmp name %d %d", OPAL_NAME_PRINT(isolated_pname),isolated_pname.jobid,isolated_pname.vpid); // setup hash table opal_pmix_base_hash_init(); /* save the job size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the appnum */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = strdup("0"); if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); return OPAL_SUCCESS; err_exit: return rc; }
static int cray_init(void) { int i, spawned, size, rank, appnum, my_node; int rc, ret = OPAL_ERROR; char *pmapping = NULL; char buf[PMI2_MAX_ATTRVALUE]; int found; int major, minor, revision; uint32_t jobfam; opal_value_t kv; opal_process_name_t ldr; char nmtmp[64]; char *str, **localranks = NULL; ++pmix_init_count; /* if we can't startup PMI, we can't be used */ if ( PMI2_Initialized () ) { opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray: pmi already initialized", OPAL_NAME_PRINT(pmix_pname)); return OPAL_SUCCESS; } size = -1; rank = -1; appnum = -1; if (PMI_SUCCESS != (rc = PMI2_Init(&spawned, &size, &rank, &appnum))) { opal_show_help("help-pmix-base.txt", "pmix2-init-failed", true, rc); return OPAL_ERROR; } if( size < 0 || rank < 0 ){ opal_show_help("help-pmix-base.txt", "pmix2-init-returned-bad-values", true); goto err_exit; } pmix_size = size; pmix_rank = rank; pmix_appnum = appnum; pmix_vallen_max = PMI2_MAX_VALLEN; pmix_kvslen_max = PMI2_MAX_VALLEN; // FIX ME: What to put here for versatility? pmix_keylen_max = PMI2_MAX_KEYLEN; pmix_vallen_threshold = PMI2_MAX_VALLEN * 3; pmix_vallen_threshold >>= 2; /* * get the version info */ if (PMI_SUCCESS != PMI_Get_version_info(&major,&minor,&revision)) { return OPAL_ERROR; } snprintf(cray_pmi_version, sizeof(cray_pmi_version), "%d.%d.%d", major, minor, revision); pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if( pmix_kvs_name == NULL ){ PMI2_Finalize(); ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI2_Job_GetId(pmix_kvs_name, pmix_kvslen_max); if( PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI2_Job_GetId"); goto err_exit; } rc = sscanf(pmix_kvs_name,"kvs_%u",&jobfam); if (rc != 1) { opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray: pmix_kvs_name %s", OPAL_NAME_PRINT(pmix_pname), pmix_kvs_name); rc = OPAL_ERROR; goto err_exit; } pmix_jobid = jobfam << 16; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ pmix_pname.jobid = pmix_jobid; pmix_pname.vpid = pmix_rank; opal_proc_set_name(&pmix_pname); opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray: assigned tmp name %d %d pmix_kvs_name %s", OPAL_NAME_PRINT(pmix_pname),pmix_pname.jobid,pmix_pname.vpid,pmix_kvs_name); pmapping = (char*)malloc(PMI2_MAX_VALLEN); if( pmapping == NULL ){ rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found); if( !found || PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr"); return OPAL_ERROR; } pmix_lranks = pmix_cray_parse_pmap(pmapping, pmix_rank, &my_node, &pmix_nlranks); if (NULL == pmix_lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } free(pmapping); // setup hash table opal_pmix_base_hash_init(); /* save the job size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = pmix_size; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the appnum */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = pmix_appnum; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); rc = PMI2_Info_GetJobAttr("universeSize", buf, 16, &found); if( PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI_Get_universe_size"); goto err_exit; } pmix_usize = atoi(buf); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = pmix_usize; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = pmix_jobid; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT16; kv.data.uint16 = pmix_nlranks; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); ldr.vpid = pmix_lranks[0]; ldr.jobid = pmix_pname.jobid; /* find ourselves and build up a string for local peer info */ memset(nmtmp, 0, 64); for (i=0; i < pmix_nlranks; i++) { ret = snprintf(nmtmp, 64, "%d", pmix_lranks[i]); opal_argv_append_nosize(&localranks, nmtmp); if (pmix_rank == pmix_lranks[i]) { pmix_lrank = i; pmix_nrank = i; } } str = opal_argv_join(localranks, ','); opal_argv_free(localranks); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = str; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = *(uint64_t*)&ldr; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = pmix_lrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = pmix_nrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); return OPAL_SUCCESS; err_exit: PMI2_Finalize(); return ret; }
static int s2_init(void) { int spawned, size, rank, appnum; int rc, ret = OPAL_ERROR; char buf[16]; int found; int my_node; uint32_t stepid; int i; opal_process_name_t ldr; opal_value_t kv; char **localranks; char *str; char nmtmp[64]; opal_process_name_t wildcard_rank; /* if we can't startup PMI, we can't be used */ if ( PMI2_Initialized () ) { return OPAL_SUCCESS; } size = -1; rank = -1; appnum = -1; // setup hash table so we always can finalize it opal_pmix_base_hash_init(); if (PMI2_SUCCESS != (rc = PMI2_Init(&spawned, &size, &rank, &appnum))) { opal_show_help("help-pmix-base.txt", "pmix2-init-failed", true, rc); return OPAL_ERROR; } if( size < 0 || rank < 0 ){ opal_show_help("help-pmix-base.txt", "pmix2-init-returned-bad-values", true); goto err_exit; } s2_jsize = size; s2_rank = rank; s2_appnum = appnum; pmix_vallen_max = PMI2_MAX_VALLEN; pmix_kvslen_max = PMI2_MAX_VALLEN; // FIX ME: What to put here for versatility? pmix_keylen_max = PMI2_MAX_KEYLEN; pmix_vallen_threshold = PMI2_MAX_VALLEN * 3; pmix_vallen_threshold >>= 2; pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if( pmix_kvs_name == NULL ){ PMI2_Finalize(); ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI2_Job_GetId(pmix_kvs_name, pmix_kvslen_max); if( PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI2_Job_GetId"); free(pmix_kvs_name); goto err_exit; } /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ s2_pname.jobid = strtoul(pmix_kvs_name, &str, 10); s2_pname.jobid = (s2_pname.jobid << 16) & 0xffff0000; if (NULL != str) { stepid = strtoul(str, NULL, 10); s2_pname.jobid |= (stepid & 0x0000ffff); } s2_pname.vpid = s2_rank; opal_proc_set_name(&s2_pname); opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s2: assigned tmp name", OPAL_NAME_PRINT(s2_pname)); /* setup wildcard rank*/ wildcard_rank = OPAL_PROC_MY_NAME; wildcard_rank.vpid = OPAL_VPID_WILDCARD; /* Slurm PMI provides the job id as an integer followed * by a '.', followed by essentially a stepid. The first integer * defines an overall job number. The second integer is the number of * individual jobs we have run within that allocation. */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = s2_pname.jobid; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); // frees pmix_kvs_name /* save the job size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = size; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the appnum */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = appnum; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); rc = PMI2_Info_GetJobAttr("universeSize", buf, 16, &found); if( PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI_Get_universe_size"); goto err_exit; } /* save it */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = atoi(buf); if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* push this into the dstore for subsequent fetches */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_MAX_PROCS); kv.type = OPAL_UINT32; kv.data.uint32 = atoi(buf); if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); char *pmapping = (char*)malloc(PMI2_MAX_VALLEN); if( pmapping == NULL ){ rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found); if( !found || PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr"); return OPAL_ERROR; } s2_lranks = mca_common_pmi2_parse_pmap(pmapping, s2_pname.vpid, &my_node, &s2_nlranks); if (NULL == s2_lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } free(pmapping); /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = s2_nlranks; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); s2_lrank = 0; s2_nrank = 0; ldr.vpid = rank; localranks = NULL; if (0 < s2_nlranks && NULL != s2_lranks) { /* note the local ldr */ ldr.vpid = s2_lranks[0]; /* find ourselves */ ldr.jobid = s2_pname.jobid; ldr.vpid = s2_pname.vpid; memset(nmtmp, 0, 64); for (i=0; i < s2_nlranks; i++) { (void)snprintf(nmtmp, 64, "%d", s2_lranks[i]); opal_argv_append_nosize(&localranks, nmtmp); if (s2_rank == s2_lranks[i]) { s2_lrank = i; s2_nrank = i; } } str = opal_argv_join(localranks, ','); opal_argv_free(localranks); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = str; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); } /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = *(uint64_t*)&ldr; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = s2_lrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = s2_nrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* increment the init count */ ++pmix_init_count; return OPAL_SUCCESS; err_exit: PMI2_Finalize(); return ret; }