mca_btl_openib_proc_t* mca_btl_openib_proc_create(opal_proc_t* proc) { mca_btl_openib_proc_t* module_proc = NULL; size_t msg_size; uint32_t size; int rc, i, j; void *message; char *offset; int modex_message_size; mca_btl_openib_modex_message_t dummy; /* Check if we have already created a IB proc * structure for this ompi process */ module_proc = mca_btl_openib_proc_lookup_proc(proc); if (NULL != module_proc) { /* Gotcha! */ return module_proc; } /* Oops! First time, gotta create a new IB proc * out of the opal_proc ... */ module_proc = OBJ_NEW(mca_btl_openib_proc_t); /* Initialize number of peer */ module_proc->proc_endpoint_count = 0; module_proc->proc_opal = proc; /* query for the peer address info */ OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version, proc, &message, &msg_size); if (OPAL_SUCCESS != rc) { BTL_ERROR(("[%s:%d] opal_modex_recv failed for peer %s", __FILE__, __LINE__, OPAL_NAME_PRINT(proc->proc_name))); OBJ_RELEASE(module_proc); return NULL; } if (0 == msg_size) { return NULL; } /* Message was packed in btl_openib_component.c; the format is listed in a comment in that file */ modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy); /* Unpack the number of modules in the message */ offset = (char *) message; unpack8(&offset, &(module_proc->proc_port_count)); BTL_VERBOSE(("unpack: %d btls", module_proc->proc_port_count)); if (module_proc->proc_port_count > 0) { module_proc->proc_ports = (mca_btl_openib_proc_modex_t *) malloc(sizeof(mca_btl_openib_proc_modex_t) * module_proc->proc_port_count); } else { module_proc->proc_ports = NULL; } /* Loop over unpacking all the ports */ for (i = 0; i < module_proc->proc_port_count; i++) { /* Unpack the modex comment message struct */ size = modex_message_size; memcpy(&(module_proc->proc_ports[i].pm_port_info), offset, size); #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT MCA_BTL_OPENIB_MODEX_MSG_NTOH(module_proc->proc_ports[i].pm_port_info); #endif offset += size; BTL_VERBOSE(("unpacked btl %d: modex message, offset now %d", i, (int)(offset-((char*)message)))); /* Unpack the number of CPCs that follow */ unpack8(&offset, &(module_proc->proc_ports[i].pm_cpc_data_count)); BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)", i, module_proc->proc_ports[i].pm_cpc_data_count, (int)(offset-((char*)message)))); module_proc->proc_ports[i].pm_cpc_data = (opal_btl_openib_connect_base_module_data_t *) calloc(module_proc->proc_ports[i].pm_cpc_data_count, sizeof(opal_btl_openib_connect_base_module_data_t)); if (NULL == module_proc->proc_ports[i].pm_cpc_data) { return NULL; } /* Unpack the CPCs */ for (j = 0; j < module_proc->proc_ports[i].pm_cpc_data_count; ++j) { uint8_t u8; opal_btl_openib_connect_base_module_data_t *cpcd; cpcd = module_proc->proc_ports[i].pm_cpc_data + j; unpack8(&offset, &u8); BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)", i, j, u8, (int)(offset-(char*)message))); cpcd->cbm_component = opal_btl_openib_connect_base_get_cpc_byindex(u8); BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s", i, j, cpcd->cbm_component->cbc_name)); unpack8(&offset, &cpcd->cbm_priority); unpack8(&offset, &cpcd->cbm_modex_message_len); BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)", i, j, cpcd->cbm_priority, cpcd->cbm_modex_message_len, (int)(offset-(char*)message))); if (cpcd->cbm_modex_message_len > 0) { cpcd->cbm_modex_message = malloc(cpcd->cbm_modex_message_len); if (NULL == cpcd->cbm_modex_message) { BTL_ERROR(("Failed to malloc")); return NULL; } memcpy(cpcd->cbm_modex_message, offset, cpcd->cbm_modex_message_len); offset += cpcd->cbm_modex_message_len; BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)", i, j, ((uint32_t*)cpcd->cbm_modex_message)[0], ((uint32_t*)cpcd->cbm_modex_message)[1], (int)(offset-((char*)message)))); } } } if (0 == module_proc->proc_port_count) { module_proc->proc_endpoints = NULL; } else { module_proc->proc_endpoints = (mca_btl_base_endpoint_t**) malloc(module_proc->proc_port_count * sizeof(mca_btl_base_endpoint_t*)); } if (NULL == module_proc->proc_endpoints) { OBJ_RELEASE(module_proc); return NULL; } BTL_VERBOSE(("unpacking done!")); return module_proc; }
int mca_btl_portals4_add_procs(struct mca_btl_base_module_t* btl_base, size_t nprocs, struct opal_proc_t **procs, struct mca_btl_base_endpoint_t** btl_peer_data, opal_bitmap_t* reachable) { struct mca_btl_portals4_module_t* portals4_btl = (struct mca_btl_portals4_module_t*) btl_base; int ret; size_t i; bool need_activate = false; opal_output_verbose(50, opal_btl_base_framework.framework_output, "mca_btl_portals4_add_procs: Adding %d procs (%d) for NI %d", (int) nprocs, (int) portals4_btl->portals_num_procs, portals4_btl->interface_num); if (0 == portals4_btl->portals_num_procs) { need_activate = true; } /* * The PML handed us a list of procs that need Portals4 * peer info. Complete those procs here. */ for (i = 0 ; i < nprocs ; ++i) { struct opal_proc_t *curr_proc = procs[i]; /* portals doesn't support heterogeneous yet... */ if (opal_proc_local_get()->proc_arch != curr_proc->proc_arch) { opal_output_verbose(1, opal_btl_base_framework.framework_output, "Portals 4 BTL does not support heterogeneous operations."); opal_output_verbose(1, opal_btl_base_framework.framework_output, "Proc %s architecture %x, mine %x.", OPAL_NAME_PRINT(curr_proc->proc_name), curr_proc->proc_arch, opal_proc_local_get()->proc_arch); return OPAL_ERR_NOT_SUPPORTED; } ret = create_endpoint(portals4_btl->interface_num, curr_proc, &btl_peer_data[i]); OPAL_THREAD_ADD32(&portals4_btl->portals_num_procs, 1); /* and here we can reach */ opal_bitmap_set_bit(reachable, i); OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output, "add_procs: rank=%lx nid=%x pid=%x for NI %d", i, btl_peer_data[i]->ptl_proc.phys.nid, btl_peer_data[i]->ptl_proc.phys.pid, portals4_btl->interface_num)); } if (need_activate && portals4_btl->portals_num_procs > 0) { if (mca_btl_portals4_component.use_logical) { ret = create_maptable(portals4_btl, nprocs, procs, btl_peer_data); if (OPAL_SUCCESS != ret) { opal_output_verbose(1, opal_btl_base_framework.framework_output, "%s:%d: mca_btl_portals4_add_procs::create_maptable() failed: %d\n", __FILE__, __LINE__, ret); return ret; } } ret = btl_portals4_init_interface(); if (OPAL_SUCCESS != ret) { opal_output_verbose(1, opal_btl_base_framework.framework_output, "%s:%d: portals4 interface initialization failed: %d", __FILE__, __LINE__, ret); return ret; } } return OPAL_SUCCESS; }
static int isolated_init(void) { int rc; opal_value_t kv; ++isolated_init_count; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ isolated_pname.jobid = 1; isolated_pname.vpid = 0; opal_proc_set_name(&isolated_pname); opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:isolated: assigned tmp name %d %d", OPAL_NAME_PRINT(isolated_pname),isolated_pname.jobid,isolated_pname.vpid); // setup hash table opal_pmix_base_hash_init(); /* save the job size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the appnum */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = 1; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = strdup("0"); if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = 0; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); return OPAL_SUCCESS; err_exit: return rc; }
static int s1_init(void) { PMI_BOOL initialized; int spawned; int rc, ret = OPAL_ERROR; int i, rank, lrank, nrank; char *pmix_id, tmp[64]; opal_value_t kv; char *str; uint32_t ui32; opal_process_name_t ldr; char **localranks=NULL; opal_process_name_t wildcard_rank; if (PMI_SUCCESS != (rc = PMI_Initialized(&initialized))) { OPAL_PMI_ERROR(rc, "PMI_Initialized"); return OPAL_ERROR; } if (PMI_TRUE != initialized && PMI_SUCCESS != (rc = PMI_Init(&spawned))) { OPAL_PMI_ERROR(rc, "PMI_Init"); return OPAL_ERROR; } // setup hash table opal_pmix_base_hash_init(); // Initialize space demands rc = PMI_KVS_Get_value_length_max(&pmix_vallen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_value_length_max"); goto err_exit; } pmix_vallen_threshold = pmix_vallen_max * 3; pmix_vallen_threshold >>= 2; rc = PMI_KVS_Get_name_length_max(&pmix_kvslen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_name_length_max"); goto err_exit; } rc = PMI_KVS_Get_key_length_max(&pmix_keylen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_key_length_max"); goto err_exit; } // Initialize job environment information pmix_id = (char*)malloc(pmix_vallen_max); if (pmix_id == NULL) { ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } /* Get domain id */ if (PMI_SUCCESS != (rc = PMI_Get_kvs_domain_id(pmix_id, pmix_vallen_max))) { free(pmix_id); goto err_exit; } /* get our rank */ ret = PMI_Get_rank(&rank); if( PMI_SUCCESS != ret ) { OPAL_PMI_ERROR(ret, "PMI_Get_rank"); free(pmix_id); goto err_exit; } /* Slurm PMI provides the job id as an integer followed * by a '.', followed by essentially a stepid. The first integer * defines an overall job number. The second integer is the number of * individual jobs we have run within that allocation. */ s1_pname.jobid = strtoul(pmix_id, &str, 10); s1_pname.jobid = (s1_pname.jobid << 16) & 0xffff0000; if (NULL != str) { ui32 = strtoul(str, NULL, 10); s1_pname.jobid |= (ui32 & 0x0000ffff); } ldr.jobid = s1_pname.jobid; s1_pname.vpid = rank; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ opal_proc_set_name(&s1_pname); opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1: assigned tmp name", OPAL_NAME_PRINT(s1_pname)); /* setup wildcard rank*/ wildcard_rank = OPAL_PROC_MY_NAME; wildcard_rank.vpid = OPAL_VPID_WILDCARD; OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = s1_pname.jobid; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save it */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_RANK); kv.type = OPAL_UINT32; kv.data.uint32 = rank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if (pmix_kvs_name == NULL) { ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI_KVS_Get_my_name(pmix_kvs_name, pmix_kvslen_max); if (PMI_SUCCESS != rc) { OPAL_PMI_ERROR(rc, "PMI_KVS_Get_my_name"); goto err_exit; } /* get our local proc info to find our local rank */ if (PMI_SUCCESS != (rc = PMI_Get_clique_size(&nlranks))) { OPAL_PMI_ERROR(rc, "PMI_Get_clique_size"); return rc; } /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = nlranks; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); lrank = 0; nrank = 0; ldr.vpid = rank; if (0 < nlranks) { /* now get the specific ranks */ lranks = (int*)calloc(nlranks, sizeof(int)); if (NULL == lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } if (PMI_SUCCESS != (rc = PMI_Get_clique_ranks(lranks, nlranks))) { OPAL_PMI_ERROR(rc, "PMI_Get_clique_ranks"); free(lranks); return rc; } /* note the local ldr */ ldr.vpid = lranks[0]; /* save this */ memset(tmp, 0, 64); for (i=0; i < nlranks; i++) { (void)snprintf(tmp, 64, "%d", lranks[i]); opal_argv_append_nosize(&localranks, tmp); if (rank == lranks[i]) { lrank = i; nrank = i; } } str = opal_argv_join(localranks, ','); opal_argv_free(localranks); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = str; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); } /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = *(uint64_t*)&ldr; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = lrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = nrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get universe size */ ret = PMI_Get_universe_size(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_universe_size"); goto err_exit; } /* push this into the dstore for subsequent fetches */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* push this into the dstore for subsequent fetches */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_MAX_PROCS); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get job size */ ret = PMI_Get_size(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_size"); goto err_exit; } OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* get appnum */ ret = PMI_Get_appnum(&i); if (PMI_SUCCESS != ret) { OPAL_PMI_ERROR(ret, "PMI_Get_appnum"); goto err_exit; } OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = i; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* increment the init count */ ++pmix_init_count; return OPAL_SUCCESS; err_exit: PMI_Finalize(); return ret; }
static void fencenb(int sd, short args, void *cbdata) { pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata; int rc = OPAL_SUCCESS; int32_t i; opal_value_t *kp, kvn; opal_hwloc_locality_t locality; opal_process_name_t s1_pname; opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1 called fence", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* use the PMI barrier function */ if (PMI_SUCCESS != (rc = PMI_Barrier())) { OPAL_PMI_ERROR(rc, "PMI_Barrier"); rc = OPAL_ERROR; goto cleanup; } opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s1 barrier complete", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* get the modex data from each local process and set the * localities to avoid having the MPI layer fetch data * for every process in the job */ s1_pname.jobid = OPAL_PROC_MY_NAME.jobid; if (!got_modex_data) { got_modex_data = true; /* we only need to set locality for each local rank as "not found" * equates to "non-local" */ for (i=0; i < nlranks; i++) { s1_pname.vpid = lranks[i]; rc = opal_pmix_base_cache_keys_locally(&s1_pname, OPAL_PMIX_CPUSET, &kp, pmix_kvs_name, pmix_vallen_max, kvs_get); if (OPAL_SUCCESS != rc) { OPAL_ERROR_LOG(rc); goto cleanup; } if (NULL == kp || NULL == kp->data.string) { /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, opal_process_info.cpuset, kp->data.string); } if (NULL != kp) { OBJ_RELEASE(kp); } OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "%s pmix:s1 proc %s locality %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_NAME_PRINT(s1_pname), opal_hwloc_base_print_locality(locality))); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_PMIX_LOCALITY); kvn.type = OPAL_UINT16; kvn.data.uint16 = locality; opal_pmix_base_store(&s1_pname, &kvn); OBJ_DESTRUCT(&kvn); } } cleanup: if (NULL != op->opcbfunc) { op->opcbfunc(rc, op->cbdata); } OBJ_RELEASE(op); return; }
int opal_pmix_base_cache_keys_locally(const opal_process_name_t* id, const char* key, opal_value_t **out_kv, char* kvs_name, int vallen, kvs_get_fn fn) { char *tmp, *tmp2, *tmp3, *tmp_val; opal_data_type_t stored_type; size_t len, offset; int rc, size; opal_value_t *kv, *knew; opal_list_t values; /* set the default */ *out_kv = NULL; /* first try to fetch data from data storage */ OBJ_CONSTRUCT(&values, opal_list_t); rc = opal_pmix_base_fetch(id, key, &values); if (OPAL_SUCCESS == rc) { kv = (opal_value_t*)opal_list_get_first(&values); /* create the copy */ if (OPAL_SUCCESS != (rc = opal_dss.copy((void**)&knew, kv, OPAL_VALUE))) { OPAL_ERROR_LOG(rc); } else { *out_kv = knew; } OPAL_LIST_DESTRUCT(&values); return rc; } OPAL_LIST_DESTRUCT(&values); OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "pmix: get all keys for proc %s in KVS %s", OPAL_NAME_PRINT(*id), kvs_name)); rc = opal_pmix_base_get_packed(id, &tmp_val, &len, vallen, fn); if (OPAL_SUCCESS != rc) { return rc; } /* search for each key in the decoded data */ for (offset = 0 ; offset < len && '\0' != tmp_val[offset] ; ) { /* type */ tmp = tmp_val + offset + strlen (tmp_val + offset) + 1; /* size */ tmp2 = tmp + strlen (tmp) + 1; /* data */ tmp3 = tmp2 + strlen (tmp2) + 1; stored_type = (opal_data_type_t) strtol (tmp, NULL, 16); size = strtol (tmp2, NULL, 16); /* cache value locally so we don't have to look it up via pmi again */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(tmp_val + offset); kv->type = stored_type; switch (stored_type) { case OPAL_BYTE: kv->data.byte = *tmp3; break; case OPAL_STRING: kv->data.string = strdup(tmp3); break; case OPAL_PID: kv->data.pid = strtoul(tmp3, NULL, 10); break; case OPAL_INT: kv->data.integer = strtol(tmp3, NULL, 10); break; case OPAL_INT8: kv->data.int8 = strtol(tmp3, NULL, 10); break; case OPAL_INT16: kv->data.int16 = strtol(tmp3, NULL, 10); break; case OPAL_INT32: kv->data.int32 = strtol(tmp3, NULL, 10); break; case OPAL_INT64: kv->data.int64 = strtol(tmp3, NULL, 10); break; case OPAL_UINT: kv->data.uint = strtoul(tmp3, NULL, 10); break; case OPAL_UINT8: kv->data.uint8 = strtoul(tmp3, NULL, 10); break; case OPAL_UINT16: kv->data.uint16 = strtoul(tmp3, NULL, 10); break; case OPAL_UINT32: kv->data.uint32 = strtoul(tmp3, NULL, 10); break; case OPAL_UINT64: kv->data.uint64 = strtoull(tmp3, NULL, 10); break; case OPAL_BYTE_OBJECT: if (size == 0xffff) { kv->data.bo.bytes = NULL; kv->data.bo.size = 0; size = 0; } else { kv->data.bo.bytes = malloc(size); memcpy(kv->data.bo.bytes, tmp3, size); kv->data.bo.size = size; } break; default: opal_output(0, "UNSUPPORTED TYPE %d", stored_type); return OPAL_ERROR; } /* store data in local hash table */ if (OPAL_SUCCESS != (rc = opal_pmix_base_store(id, kv))) { OPAL_ERROR_LOG(rc); } /* keep going and cache everything locally */ offset = (size_t) (tmp3 - tmp_val) + size; if (0 == strcmp(kv->key, key)) { /* create the copy */ if (OPAL_SUCCESS != (rc = opal_dss.copy((void**)&knew, kv, OPAL_VALUE))) { OPAL_ERROR_LOG(rc); } else { *out_kv = knew; } } } free (tmp_val); /* if there was no issue with unpacking the message, but * we didn't find the requested info, then indicate that * the info wasn't found */ if (OPAL_SUCCESS == rc && NULL == *out_kv) { return OPAL_ERR_NOT_FOUND; } return rc; }
static int s2_init(void) { int spawned, size, rank, appnum; int rc, ret = OPAL_ERROR; char buf[16]; int found; int my_node; uint32_t stepid; int i; opal_process_name_t ldr; opal_value_t kv; char **localranks; char *str; char nmtmp[64]; opal_process_name_t wildcard_rank; /* if we can't startup PMI, we can't be used */ if ( PMI2_Initialized () ) { return OPAL_SUCCESS; } size = -1; rank = -1; appnum = -1; // setup hash table so we always can finalize it opal_pmix_base_hash_init(); if (PMI2_SUCCESS != (rc = PMI2_Init(&spawned, &size, &rank, &appnum))) { opal_show_help("help-pmix-base.txt", "pmix2-init-failed", true, rc); return OPAL_ERROR; } if( size < 0 || rank < 0 ){ opal_show_help("help-pmix-base.txt", "pmix2-init-returned-bad-values", true); goto err_exit; } s2_jsize = size; s2_rank = rank; s2_appnum = appnum; pmix_vallen_max = PMI2_MAX_VALLEN; pmix_kvslen_max = PMI2_MAX_VALLEN; // FIX ME: What to put here for versatility? pmix_keylen_max = PMI2_MAX_KEYLEN; pmix_vallen_threshold = PMI2_MAX_VALLEN * 3; pmix_vallen_threshold >>= 2; pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if( pmix_kvs_name == NULL ){ PMI2_Finalize(); ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI2_Job_GetId(pmix_kvs_name, pmix_kvslen_max); if( PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI2_Job_GetId"); free(pmix_kvs_name); goto err_exit; } /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ s2_pname.jobid = strtoul(pmix_kvs_name, &str, 10); s2_pname.jobid = (s2_pname.jobid << 16) & 0xffff0000; if (NULL != str) { stepid = strtoul(str, NULL, 10); s2_pname.jobid |= (stepid & 0x0000ffff); } s2_pname.vpid = s2_rank; opal_proc_set_name(&s2_pname); opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s pmix:s2: assigned tmp name", OPAL_NAME_PRINT(s2_pname)); /* setup wildcard rank*/ wildcard_rank = OPAL_PROC_MY_NAME; wildcard_rank.vpid = OPAL_VPID_WILDCARD; /* Slurm PMI provides the job id as an integer followed * by a '.', followed by essentially a stepid. The first integer * defines an overall job number. The second integer is the number of * individual jobs we have run within that allocation. */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOBID); kv.type = OPAL_UINT32; kv.data.uint32 = s2_pname.jobid; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); // frees pmix_kvs_name /* save the job size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_JOB_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = size; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save the appnum */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_APPNUM); kv.type = OPAL_UINT32; kv.data.uint32 = appnum; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); rc = PMI2_Info_GetJobAttr("universeSize", buf, 16, &found); if( PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI_Get_universe_size"); goto err_exit; } /* save it */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_UNIV_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = atoi(buf); if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* push this into the dstore for subsequent fetches */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_MAX_PROCS); kv.type = OPAL_UINT32; kv.data.uint32 = atoi(buf); if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); char *pmapping = (char*)malloc(PMI2_MAX_VALLEN); if( pmapping == NULL ){ rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found); if( !found || PMI2_SUCCESS != rc ) { OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr"); return OPAL_ERROR; } s2_lranks = mca_common_pmi2_parse_pmap(pmapping, s2_pname.vpid, &my_node, &s2_nlranks); if (NULL == s2_lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } free(pmapping); /* save the local size */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_SIZE); kv.type = OPAL_UINT32; kv.data.uint32 = s2_nlranks; if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); s2_lrank = 0; s2_nrank = 0; ldr.vpid = rank; localranks = NULL; if (0 < s2_nlranks && NULL != s2_lranks) { /* note the local ldr */ ldr.vpid = s2_lranks[0]; /* find ourselves */ ldr.jobid = s2_pname.jobid; ldr.vpid = s2_pname.vpid; memset(nmtmp, 0, 64); for (i=0; i < s2_nlranks; i++) { (void)snprintf(nmtmp, 64, "%d", s2_lranks[i]); opal_argv_append_nosize(&localranks, nmtmp); if (s2_rank == s2_lranks[i]) { s2_lrank = i; s2_nrank = i; } } str = opal_argv_join(localranks, ','); opal_argv_free(localranks); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_PEERS); kv.type = OPAL_STRING; kv.data.string = str; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&wildcard_rank, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); } /* save the local leader */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCALLDR); kv.type = OPAL_UINT64; kv.data.uint64 = *(uint64_t*)&ldr; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* save our local rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_LOCAL_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = s2_lrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* and our node rank */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NODE_RANK); kv.type = OPAL_UINT16; kv.data.uint16 = s2_nrank; if (OPAL_SUCCESS != (ret = opal_pmix_base_store(&OPAL_PROC_MY_NAME, &kv))) { OPAL_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); goto err_exit; } OBJ_DESTRUCT(&kv); /* increment the init count */ ++pmix_init_count; return OPAL_SUCCESS; err_exit: PMI2_Finalize(); return ret; }
static int cray_fence(opal_process_name_t *procs, size_t nprocs) { int rc; int32_t i; opal_value_t *kp, kvn; opal_hwloc_locality_t locality; opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray called fence", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* check if there is partially filled meta key and put them */ if (0 != pmix_packed_data_offset && NULL != pmix_packed_data) { opal_pmix_base_commit_packed(pmix_packed_data, pmix_packed_data_offset, pmix_vallen_max, &pmix_pack_key, kvs_put); pmix_packed_data_offset = 0; free(pmix_packed_data); pmix_packed_data = NULL; } if (PMI_SUCCESS != (rc = PMI2_KVS_Fence())) { OPAL_PMI_ERROR(rc, "PMI2_KVS_Fence"); return OPAL_ERROR; } opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray kvs_fence complete", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* get the modex data from each local process and set the * localities to avoid having the MPI layer fetch data * for every process in the job */ if (!pmix_got_modex_data) { pmix_got_modex_data = true; /* we only need to set locality for each local rank as "not found" * equates to "non-local" */ for (i=0; i < pmix_nlranks; i++) { pmix_pname.vid = pmix_lranks[i]; rc = opal_pmix_base_cache_keys_locally((opal_identifier_t*)&pmix_pname, OPAL_DSTORE_CPUSET, &kp, pmix_kvs_name, pmix_vallen_max, kvs_get); if (OPAL_SUCCESS != rc) { OPAL_ERROR_LOG(rc); return rc; } #if OPAL_HAVE_HWLOC if (NULL == kp || NULL == kp->data.string) { /* if we share a node, but we don't know anything more, then * mark us as on the node as this is all we know */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* determine relative location on our node */ locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, opal_process_info.cpuset, kp->data.string); } if (NULL != kp) { OBJ_RELEASE(kp); } #else /* all we know is we share a node */ locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; #endif OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output, "%s pmix:s2 proc %s locality %s", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), OPAL_NAME_PRINT(*(opal_identifier_t*)&pmix_pname), opal_hwloc_base_print_locality(locality))); OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_LOCALITY); kvn.type = OPAL_UINT16; kvn.data.uint16 = locality; (void)opal_dstore.store(opal_dstore_internal, (opal_identifier_t*)&pmix_pname, &kvn); OBJ_DESTRUCT(&kvn); } } return OPAL_SUCCESS; }
static int cray_init(void) { int i, spawned, size, rank, appnum, my_node; int rc, ret = OPAL_ERROR; char *pmapping = NULL; char buf[PMI2_MAX_ATTRVALUE]; int found; uint32_t jobfam; ++pmix_init_count; /* if we can't startup PMI, we can't be used */ if ( PMI2_Initialized () ) { return OPAL_SUCCESS; } size = -1; rank = -1; appnum = -1; if (PMI_SUCCESS != (rc = PMI2_Init(&spawned, &size, &rank, &appnum))) { opal_show_help("help-pmix-base.txt", "pmix2-init-failed", true, rc); return OPAL_ERROR; } if( size < 0 || rank < 0 ){ opal_show_help("help-pmix-base.txt", "pmix2-init-returned-bad-values", true); goto err_exit; } pmix_size = size; pmix_rank = rank; pmix_appnum = appnum; pmix_vallen_max = PMI2_MAX_VALLEN; pmix_kvslen_max = PMI2_MAX_VALLEN; // FIX ME: What to put here for versatility? pmix_keylen_max = PMI2_MAX_KEYLEN; rc = PMI2_Info_GetJobAttr("universeSize", buf, 16, &found); if( PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI_Get_universe_size"); goto err_exit; } pmix_usize = atoi(buf); pmix_kvs_name = (char*)malloc(pmix_kvslen_max); if( pmix_kvs_name == NULL ){ PMI2_Finalize(); ret = OPAL_ERR_OUT_OF_RESOURCE; goto err_exit; } rc = PMI2_Job_GetId(pmix_kvs_name, pmix_kvslen_max); if( PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc, "PMI2_Job_GetId"); goto err_exit; } rc = sscanf(pmix_kvs_name,"kvs_%u",&jobfam); if (rc != 1) { OPAL_PMI_ERROR(rc, "PMI2_Job_GetId"); rc = OPAL_ERROR; goto err_exit; } pmix_jobid = jobfam << 16; /* store our name in the opal_proc_t so that * debug messages will make sense - an upper * layer will eventually overwrite it, but that * won't do any harm */ pmix_pname.jid = pmix_jobid; pmix_pname.vid = pmix_rank; opal_proc_set_name((opal_process_name_t*)&pmix_pname); opal_output_verbose(10, opal_pmix_base_framework.framework_output, "%s pmix:cray: assigned tmp name %d %d pmix_kvs_name %s", OPAL_NAME_PRINT(*(opal_process_name_t*)&pmix_pname),pmix_pname.jid,pmix_pname.vid,pmix_kvs_name); pmapping = (char*)malloc(PMI2_MAX_VALLEN); if( pmapping == NULL ){ rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } rc = PMI2_Info_GetJobAttr("PMI_process_mapping", pmapping, PMI2_MAX_VALLEN, &found); if( !found || PMI_SUCCESS != rc ) { OPAL_PMI_ERROR(rc,"PMI2_Info_GetJobAttr"); return OPAL_ERROR; } pmix_lranks = pmix_cray_parse_pmap(pmapping, pmix_rank, &my_node, &pmix_nlranks); if (NULL == pmix_lranks) { rc = OPAL_ERR_OUT_OF_RESOURCE; OPAL_ERROR_LOG(rc); return rc; } free(pmapping); /* find ourselves */ for (i=0; i < pmix_nlranks; i++) { if (pmix_rank == pmix_lranks[i]) { pmix_lrank = i; pmix_nrank = my_node; break; } } return OPAL_SUCCESS; err_exit: PMI2_Finalize(); return ret; }
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device) { mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules; mca_btl_base_endpoint_t *ep; gni_ep_handle_t handle; int count = 0, rc; rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep); if (1 != rc) { return rc; } BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle)); /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */ if (handle == ugni_module->wildcard_ep) { struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name); BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s", OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc); if (OPAL_UNLIKELY(NULL == ep)) { /* there is no way to recover from this error so just abort() */ BTL_ERROR(("could not find/allocate a btl endpoint for peer %s", OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name))); abort (); return OPAL_ERR_NOT_FOUND; } } /* should not have gotten a NULL endpoint */ assert (NULL != ep); BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) ep, handle == ugni_module->wildcard_ep)); /* NTH: TODO -- error handling */ opal_mutex_lock (&ep->lock); if (handle != ugni_module->wildcard_ep) { /* directed post complete */ BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep)); ep->dg_posted = false; (void) opal_atomic_add_32 (&ugni_module->active_datagrams, -1); } (void) mca_btl_ugni_ep_connect_progress (ep); opal_mutex_unlock (&ep->lock); if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) { /* process messages waiting in the endpoint's smsg mailbox */ count = mca_btl_ugni_smsg_process (ep); } /* repost the wildcard datagram */ if (handle == ugni_module->wildcard_ep) { mca_btl_ugni_wildcard_ep_post (ugni_module); } return count; }