/*
 * Perform a modex (module data exchange) among all of our job peers.
 *
 * Packs our process name plus our locally-posted modex entries into a
 * buffer, allgathers it across the job, and hands the combined result
 * to orte_grpcomm_base_modex_unpack for processing.
 *
 * @param modex_db  passed straight through to orte_grpcomm_base_modex_unpack;
 *                  presumably selects whether the received data is stored in
 *                  the modex database — TODO confirm against the unpack impl
 * @return ORTE_SUCCESS, or the first error code encountered (also logged)
 */
int orte_grpcomm_base_peer_modex(bool modex_db)
{
    opal_buffer_t buf, rbuf;
    int rc = ORTE_SUCCESS;
    bool modex_reqd;   /* written by pack_modex_entries but not consulted here */

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:base:peer:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer that will actually be sent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    OBJ_CONSTRUCT(&rbuf, opal_buffer_t);

    /* put our process name in the buffer so it can be unpacked later */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:peer:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* exchange the buffer with my peers */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:peer:modex: processing modex info",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* process everything our peers sent us */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_modex_unpack(&rbuf, modex_db)) ) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

cleanup:
    /* both buffers were constructed above, so always destruct them */
    OBJ_DESTRUCT(&buf);
    OBJ_DESTRUCT(&rbuf);
    return rc;
}
/*** MODEX SECTION ***/

/*
 * Basic-component modex: exchange process name, (optionally) architecture,
 * and posted modex entries with either all peers (procs == NULL) or the
 * given list of participants.
 *
 * NOTE(review): the sender packs its arch when
 * (orte_hetero_apps || !orte_homogeneous_nodes), but the receiver skips the
 * arch unpack whenever orte_homogeneous_nodes is true. These conditions only
 * agree if all participants hold identical values of both globals — TODO
 * confirm that invariant elsewhere in the code base.
 *
 * @param procs  list of participants, or NULL for all job peers
 * @return ORTE_SUCCESS or the first (logged) error code
 */
static int modex(opal_list_t *procs)
{
    opal_buffer_t buf, rbuf;
    orte_std_cntr_t i, num_procs;
    orte_std_cntr_t cnt;
    orte_process_name_t proc_name;
    int rc;
    int32_t arch;
    bool modex_reqd = false;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer that will actually be sent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    OBJ_CONSTRUCT(&rbuf, opal_buffer_t);

    /* put our process name in the buffer so it can be unpacked later */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* decide if we need to add the architecture to the modex. Check
     * first to see if hetero is enabled - if not, then we clearly
     * don't need to exchange arch's as they are all identical
     */
    if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
        /* Case 1: If different apps in this job were built differently - e.g., some
         * are built 32-bit while others are built 64-bit - then we need to modex
         * regardless of any other consideration. The user is reqd to tell us via a
         * cmd line option if this situation exists, which will result in an mca param
         * being set for us, so all we need to do is check for the global boolean
         * that corresponds to that param
         *
         * Case 2: the nodes are hetero, but the app binaries were built
         * the same - i.e., either they are both 32-bit, or they are both 64-bit, but
         * no mixing of the two. In this case, we include the info in the modex
         */
        if (orte_hetero_apps || !orte_homogeneous_nodes) {
            OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                                 "%s grpcomm:basic: modex is required",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            modex_reqd = true;
        }
    }
    /* pack our architecture only when the hetero checks above demanded it */
    if (modex_reqd) {
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.arch, 1, OPAL_UINT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
    }
    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    /* pack_modex_entries may also set modex_reqd; only exchange if needed */
    if (modex_reqd) {
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s grpcomm:basic:modex: executing allgather",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* exchange the buffer with the list of peers (if provided) or all my peers */
        if (NULL == procs) {
            if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(&buf, &rbuf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        } else {
            if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s grpcomm:basic:modex: processing modex info",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process the results */
        /* extract the number of procs that put data in the buffer */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                             "%s grpcomm:basic:modex: received %ld data bytes from %ld procs",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (long)(rbuf.pack_ptr - rbuf.unpack_ptr), (long)num_procs));

        /* if the buffer doesn't have any more data, ignore it */
        if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) {
            goto cleanup;
        }

        /* otherwise, process it */
        for (i=0; i < num_procs; i++) {
            /* unpack the process name */
            cnt=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            if (OMPI_ENABLE_HETEROGENEOUS_SUPPORT) {
                /* are the nodes hetero? */
                if (orte_homogeneous_nodes) {
                    /* homogeneous: no arch was packed for this proc */
                    goto unpack_entries;
                }
                /* unpack its architecture */
                cnt=1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &arch, &cnt, OPAL_UINT32))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
                /* update the arch in the ESS */
                if (ORTE_SUCCESS != (rc = orte_ess.update_arch(&proc_name, arch))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
        unpack_entries:
            /* update the modex database */
            if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        }
    }

cleanup:
    OBJ_DESTRUCT(&buf);
    OBJ_DESTRUCT(&rbuf);
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic: modex completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return rc;
}
/*************** MODEX SECTION **************/

/*
 * Event-callback entry point for a modex collective.
 *
 * With an empty participant list this is a job-peer modex: only PEER-scoped
 * data is packed and a jobid-wildcard participant is added so the daemon can
 * identify the job. With an explicit participant list (e.g. connect/accept)
 * the collective id, our name, and GLOBAL-scoped data are packed, and any
 * previously-arrived contributions from other participants are merged in.
 *
 * NOTE(review): on pack/allgather failure we just return; the collective
 * stays on active_colls flagged active, so callers presumably rely on a
 * separate teardown path — TODO confirm.
 */
void orte_grpcomm_base_modex(int fd, short args, void *cbdata)
{
    orte_grpcomm_caddy_t *caddy = (orte_grpcomm_caddy_t*)cbdata;
    orte_grpcomm_collective_t *modex = caddy->op;
    int rc;
    orte_namelist_t *nm;
    opal_list_item_t *item;
    bool found;
    orte_grpcomm_collective_t *cptr;
    opal_scope_t scope;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are a singleton and routing isn't enabled,
     * then we have nobody with which to communicate, so
     * we can just declare success
     */
    if ((orte_process_info.proc_type & ORTE_PROC_SINGLETON) &&
        !orte_routing_is_enabled) {
        if (NULL != modex->cbfunc) {
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                                 "%s CALLING MODEX RELEASE",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            modex->cbfunc(NULL, modex->cbdata);
        }
        /* flag the collective as complete */
        modex->active = false;
        return;
    }

    if (0 == opal_list_get_size(&modex->participants)) {
        /* record the collective */
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* put our process name in the buffer so it can be unpacked later */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* this is between our peers, so only collect info marked for them */
        scope = OPAL_SCOPE_PEER;

        /* add a wildcard name to the participants so the daemon knows
         * the jobid that is involved in this collective
         */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&modex->participants, &nm->super);
        modex->next_cb = orte_grpcomm_base_store_modex;
    } else {
        /* see if the collective is already present - a race condition
         * exists where other participants may have already sent us their
         * contribution. This would place the collective on the global
         * array, but leave it marked as "inactive" until we call
         * modex with the list of participants
         */
        found = false;
        for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
             item != opal_list_get_end(&orte_grpcomm_base.active_colls);
             item = opal_list_get_next(item)) {
            cptr = (orte_grpcomm_collective_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s CHECKING COLL id %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cptr->id));
            if (modex->id == cptr->id) {
                found = true;
                /* remove the old entry - we will replace it
                 * with the modex one
                 */
                opal_list_remove_item(&orte_grpcomm_base.active_colls, item);
                break;
            }
        }
        if (found) {
            /* since it already exists, the list of
             * targets contains the list of procs
             * that have already sent us their info. Cycle
             * thru the targets and move those entries to
             * the modex object
             */
            while (NULL != (item = opal_list_remove_first(&cptr->targets))) {
                opal_list_append(&modex->targets, item);
            }
            /* copy the previously-saved data across */
            opal_dss.copy_payload(&modex->local_bucket, &cptr->local_bucket);
            /* cleanup */
            OBJ_RELEASE(cptr);
        }
        /* now add the modex to the global list of active collectives */
        modex->next_cb = orte_grpcomm_base_store_modex;
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* pack the collective id */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &modex->id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our name */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* this is not amongst our peers, but rather between a select
         * group of processes - e.g., during a connect/accept operation.
         * Thus, we need to include the non-peer info as well as our peers
         * since we can't tell what the other participants may already have
         */
        scope = OPAL_SCOPE_GLOBAL;
    }

    /* pack the requested entries */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer, scope))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* execute the allgather */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: modex posted",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return;

cleanup:
    return;
}
/*************** MODEX SECTION **************/

/*
 * Full modex with an explicit list of participants: exchanges name,
 * hostname, daemon vpid, node rank, local rank, and posted modex entries,
 * then updates the nidmap/pidmap so dynamic (connect/accept) operations can
 * route to the new procs, and finally stores each proc's modex data either
 * in the modex database (modex_db == true) or on the proc's node attributes.
 *
 * FIX(review): the hostname string returned by opal_dss.unpack(OPAL_STRING)
 * is heap-allocated; it was previously never freed, leaking once per
 * received proc. It is now freed after its last use (and on the error paths
 * between its allocation and that point). The attr object is likewise now
 * released on its unpack/alloc error paths.
 *
 * @param procs     list of participants for the allgather
 * @param modex_db  true => store entries via the modex database,
 *                  false => append raw attributes to the proc's node
 * @return ORTE_SUCCESS or the first (logged) error code
 */
int orte_grpcomm_base_full_modex(opal_list_t *procs, bool modex_db)
{
    opal_buffer_t buf, rbuf;
    int32_t i, n, num_procs;
    orte_std_cntr_t cnt, j, num_recvd_entries;
    orte_process_name_t proc_name;
    int rc=ORTE_SUCCESS;
    bool modex_reqd;
    orte_nid_t *nid;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    orte_vpid_t daemon;
    char *hostname;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer that will actually be sent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    OBJ_CONSTRUCT(&rbuf, opal_buffer_t);

    /* put our process name in the buffer so it can be unpacked later */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack our hostname */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack our daemon's vpid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack our node rank */
    node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node_rank, 1, ORTE_NODE_RANK))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack our local rank */
    local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &local_rank, 1, ORTE_LOCAL_RANK))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* exchange the buffer with the list of peers */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: processing modex info",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* extract the number of procs that put data in the buffer */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: received %ld data bytes from %d procs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)(rbuf.pack_ptr - rbuf.unpack_ptr), num_procs));

    /* if the buffer doesn't have any more data, ignore it */
    if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) {
        goto cleanup;
    }

    /* otherwise, process it */
    for (i=0; i < num_procs; i++) {
        /* unpack the process name */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* unpack the hostname - NOTE: opal_dss allocates this string;
         * we own it and must free it below
         */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &hostname, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* unpack the daemon vpid */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &daemon, &cnt, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            free(hostname);   /* don't leak on this error path */
            goto cleanup;
        }

        /* unpack the node rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            free(hostname);   /* don't leak on this error path */
            goto cleanup;
        }

        /* unpack the local rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            free(hostname);   /* don't leak on this error path */
            goto cleanup;
        }

        /* UPDATE THE NIDMAP/PIDMAP TO SUPPORT DYNAMIC OPERATIONS */

        /* find this proc's node in the nidmap */
        nid = NULL;
        for (n=0; NULL != (nid = (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, n)); n++) {
            if (0 == strcmp(hostname, nid->name)) {
                break;
            }
        }
        if (NULL == nid) {
            /* node wasn't found - let's add it */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex no nidmap entry for node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
            nid = OBJ_NEW(orte_nid_t);
            nid->name = strdup(hostname);   /* nid keeps its own copy */
            nid->daemon = daemon;
            nid->index = opal_pointer_array_add(&orte_nidmap, nid);
        }
        /* hostname is no longer needed - release it (was leaked before) */
        free(hostname);
        hostname = NULL;

        /* see if we have this job in a jobmap */
        if (NULL == (jmap = orte_util_lookup_jmap(proc_name.jobid))) {
            /* proc wasn't found - let's add it */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex no jobmap entry for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(proc_name.jobid)));
            jmap = OBJ_NEW(orte_jmap_t);
            jmap->job = proc_name.jobid;
            /* unfortunately, job objects cannot be stored
             * by index number as the jobid is a constructed
             * value. So we have to just add it to the end
             * of the array
             */
            opal_pointer_array_add(&orte_jobmap, jmap);
            jmap->num_procs = 1;
            /* have to add the pidmap entry too, but this
             * can be done at the specific site corresponding
             * to the proc's vpid
             */
            pmap = OBJ_NEW(orte_pmap_t);
            pmap->node = nid->index;
            pmap->local_rank = local_rank;
            pmap->node_rank = node_rank;
            opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
        } else {
            /* see if we have this proc in a pidmap */
            if (NULL == orte_util_lookup_pmap(&proc_name)) {
                /* proc wasn't found - let's add it */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                     "%s grpcomm:base:full:modex no pidmap entry for proc %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc_name)));
                pmap = OBJ_NEW(orte_pmap_t);
                pmap->node = nid->index;
                pmap->local_rank = local_rank;
                pmap->node_rank = node_rank;
                /* this can be done at the specific site corresponding
                 * to the proc's vpid
                 */
                opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
                /* account for the proc entry in the jmap */
                jmap->num_procs++;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                             "%s grpcomm:base:full:modex: adding modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name)));

        /* UPDATE THE MODEX INFO FOR THIS PROC */

        if (modex_db) {
            /* update the modex database */
            if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
        } else {
            /* unpack the number of entries for this proc */
            cnt=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_recvd_entries, &cnt, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex adding %d entries for proc %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_recvd_entries,
                                 ORTE_NAME_PRINT(&proc_name)));

            /*
             * Extract the attribute names and values
             */
            for (j = 0; j < num_recvd_entries; j++) {
                size_t num_bytes;
                orte_attr_t *attr;

                attr = OBJ_NEW(orte_attr_t);
                cnt = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &(attr->name), &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(attr);   /* don't leak the partially-built attr */
                    goto cleanup;
                }

                cnt = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_bytes, &cnt, OPAL_SIZE))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(attr);   /* don't leak the partially-built attr */
                    goto cleanup;
                }
                attr->size = num_bytes;

                if (num_bytes != 0) {
                    if (NULL == (attr->bytes = (uint8_t *) malloc(num_bytes))) {
                        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        OBJ_RELEASE(attr);   /* don't leak the partially-built attr */
                        goto cleanup;
                    }
                    cnt = (orte_std_cntr_t) num_bytes;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, attr->bytes, &cnt, OPAL_BYTE))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(attr);   /* don't leak the partially-built attr */
                        goto cleanup;
                    }
                }

                /* add this to the node's attribute list */
                opal_list_append(&nid->attrs, &attr->super);
            }
        }
    }

cleanup:
    OBJ_DESTRUCT(&buf);
    OBJ_DESTRUCT(&rbuf);
    return rc;
}
/*
 * "bad"-component modex: with no participant list the exchange is performed
 * in the background by the daemons - we send our contribution to our local
 * daemon and post a non-blocking receive for the combined result. With an
 * explicit list we fall back to the blocking full modex.
 *
 * FIX(review): the early "return rc" statements after the coll_type pack
 * failure, the send_buffer failure, and the recv_buffer_nb failure leaked
 * buf and rbuf even though a cleanup label releasing them already existed;
 * they now goto cleanup instead. Also fixed the "rteceived" comment typo.
 *
 * @param procs  list of participants, or NULL for all job peers
 * @return ORTE_SUCCESS or the first (logged) error code
 */
static int modex(opal_list_t *procs)
{
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (NULL == procs) {
        /* The modex will be realized in the background by the daemons. The processes will
         * only be informed when all data has been collected from all processes. The get_attr
         * will realize the blocking, it will not return until the data has been received.
         */
        opal_buffer_t *buf, *rbuf;
        orte_grpcomm_coll_t coll_type = ORTE_GRPCOMM_ALLGATHER;
        bool modex_reqd = true;

        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                             "%s grpcomm:bad:peer:modex: performing modex",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* setup the buffer that will actually be sent */
        buf = OBJ_NEW(opal_buffer_t);
        rbuf = OBJ_NEW(opal_buffer_t);

        /* tell the daemon we are doing an allgather */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;   /* was "return rc" - leaked buf/rbuf */
        }

        /* put our process name in the buffer so it can be unpacked later */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack the entries we have received */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(buf, &modex_reqd))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s grpcomm:bad:peer:modex: executing non-blocking allgather",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* send to local daemon */
        if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, buf,
                                           ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;   /* was "return rc" - leaked buf/rbuf */
        }

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s grpcomm:bad allgather buffer sent",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* now receive the final result. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        allgather_complete = false;
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_NON_PERSISTENT, allgather_recv_modex, (void*)rbuf);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;   /* was "return rc" - leaked buf/rbuf */
        }
        rbuf = NULL;  /* make sure we don't release it yet - owned by the recv callback */

        OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                             "%s grpcomm:bad: modex posted",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    cleanup:
        if( NULL != buf ) {
            OBJ_RELEASE(buf);
        }
        if( NULL != rbuf ) {
            OBJ_RELEASE(rbuf);
        }
        return rc;
    } else {
        /* explicit participant list: do the blocking full modex */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_full_modex(procs, true))) {
            ORTE_ERROR_LOG(rc);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad: modex completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return rc;
}
/*************** MODEX SECTION **************/

/*
 * Event-callback entry point for a modex collective (variant that also
 * exchanges location info for non-peer collectives).
 *
 * With an empty participant list this is a job-peer modex: our name is
 * packed and a jobid-wildcard participant added so the daemon can identify
 * the job. With an explicit participant list (e.g. connect/accept) we also
 * pack the collective id, hostname, daemon vpid, node/local ranks and
 * (when hwloc is available) our cpuset so the other side can compute
 * locality for procs it has never seen.
 *
 * NOTE(review): on pack/allgather failure we just return; the collective
 * stays on active_colls flagged active — presumably torn down elsewhere,
 * TODO confirm.
 */
void orte_grpcomm_base_modex(int fd, short args, void *cbdata)
{
    orte_grpcomm_caddy_t *caddy = (orte_grpcomm_caddy_t*)cbdata;
    orte_grpcomm_collective_t *modex = caddy->op;
    int rc;
    orte_namelist_t *nm;
    opal_list_item_t *item;
    bool found;
    orte_grpcomm_collective_t *cptr;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (0 == opal_list_get_size(&modex->participants)) {
        /* record the collective */
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* put our process name in the buffer so it can be unpacked later */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* add a wildcard name to the participants so the daemon knows
         * the jobid that is involved in this collective
         */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&modex->participants, &nm->super);
        modex->next_cb = orte_grpcomm_base_store_modex;
    } else {
        /* see if the collective is already present - a race condition
         * exists where other participants may have already sent us their
         * contribution. This would place the collective on the global
         * array, but leave it marked as "inactive" until we call
         * modex with the list of participants
         */
        found = false;
        for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
             item != opal_list_get_end(&orte_grpcomm_base.active_colls);
             item = opal_list_get_next(item)) {
            cptr = (orte_grpcomm_collective_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s CHECKING COLL id %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cptr->id));
            if (modex->id == cptr->id) {
                found = true;
                /* remove the old entry - we will replace it
                 * with the modex one
                 */
                opal_list_remove_item(&orte_grpcomm_base.active_colls, item);
                break;
            }
        }
        if (found) {
            /* since it already exists, the list of
             * targets contains the list of procs
             * that have already sent us their info. Cycle
             * thru the targets and move those entries to
             * the modex object
             */
            while (NULL != (item = opal_list_remove_first(&cptr->targets))) {
                opal_list_append(&modex->targets, item);
            }
            /* copy the previously-saved data across */
            opal_dss.copy_payload(&modex->local_bucket, &cptr->local_bucket);
            /* cleanup */
            OBJ_RELEASE(cptr);
        }
        /* now add the modex to the global list of active collectives */
        modex->next_cb = orte_grpcomm_base_store_peer_modex;
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* this is not amongst our peers, but rather between a select
         * group of processes - e.g., during a connect/accept operation.
         * Thus, this requires we send additional info
         */

        /* pack the collective id */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &modex->id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our name */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our hostname */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.nodename, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our daemon's vpid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our node rank */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_node_rank, 1, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our local rank */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.my_local_rank, 1, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

#if OPAL_HAVE_HWLOC
        /* pack our binding info so other procs can determine our locality */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &orte_process_info.cpuset, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
#endif
    }

    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* execute the allgather */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: modex posted",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return;

cleanup:
    return;
}