static int orte_grpcomm_bad_get_proc_attr(const orte_process_name_t proc,
                                          const char *attribute_name,
                                          void **val, size_t *size)
{
    /* wait for any pending allgather to complete before looking up the attribute */
    if (false == allgather_complete) {
        ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);
    }
    return orte_grpcomm_base_get_proc_attr(proc, attribute_name, val, size);
}
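/* Every routine in this section blocks via ORTE_PROGRESSED_WAIT from
 * orte/runtime/orte_wait.h. A sketch of its semantics, approximately as
 * defined there (the _SKETCH suffix marks this as our rendering, not the
 * verbatim macro): spin the progress engine until either the flag becomes
 * true or the counter reaches the limit, so messages keep flowing while
 * the caller "waits".
 */
#define ORTE_PROGRESSED_WAIT_SKETCH(flag, counter, limit)   \
    do {                                                    \
        while (!(flag) && ((counter) < (limit))) {          \
            opal_progress();                                \
        }                                                   \
    } while (0)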
static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc;
    opal_buffer_t coll;
    orte_grpcomm_coll_t coll_type = ORTE_GRPCOMM_ALLGATHER;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* everyone sends data to their local daemon */
    OBJ_CONSTRUCT(&coll, opal_buffer_t);

    /* tell the daemon we are doing an allgather */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* add our data to it */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll, sbuf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &coll,
                                       ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    OBJ_DESTRUCT(&coll);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad allgather buffer sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* now receive the final result. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    allgather_complete = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                 ORTE_RML_NON_PERSISTENT, allgather_recv, rbuf);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
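/* A minimal sketch (assumed, not the verbatim module code) of the
 * allgather_recv callback posted above: the caller's result buffer arrives
 * as cbdata, the payload is transferred into it, and the completion flag
 * is set so the ORTE_PROGRESSED_WAIT in allgather() can fall through.
 */
static void allgather_recv(int status, orte_process_name_t *sender,
                           opal_buffer_t *buffer, orte_rml_tag_t tag,
                           void *cbdata)
{
    opal_buffer_t *rbuf = (opal_buffer_t*)cbdata;
    int rc;

    /* transfer the received payload to the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, buffer))) {
        ORTE_ERROR_LOG(rc);
    }
    /* release the wait in allgather() */
    allgather_complete = true;
}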
static int tool_close(const orte_process_name_t* src_name,
                      orte_iof_tag_t src_tag)
{
    /* if we are a tool, then we need to request the HNP to stop
     * forwarding data from this process/stream
     */
    opal_buffer_t *buf;
    orte_iof_tag_t tag;
    orte_process_name_t hnp;
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s closing output for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(src_name)));

    buf = OBJ_NEW(opal_buffer_t);

    /* setup the tag to stop the copy */
    tag = src_tag | ORTE_IOF_CLOSE;

    /* pack the tag - we do this first so that flow control messages can
     * consist solely of the tag
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }
    /* pack the name of the source */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, src_name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }

    /* flag that the close is incomplete */
    mca_iof_tool_component.closed = false;

    /* send the buffer to the correct HNP */
    ORTE_HNP_NAME_FROM_JOB(&hnp, src_name->jobid);
    orte_rml.send_buffer_nb(&hnp, buf, ORTE_RML_TAG_IOF_HNP,
                            0, send_cb, NULL);

    /* wait right here until the close is confirmed */
    ORTE_PROGRESSED_WAIT(mca_iof_tool_component.closed, 0, 1);

    return ORTE_SUCCESS;
}
static int barrier(void)
{
    opal_buffer_t buf;
    orte_grpcomm_coll_t coll_type = ORTE_GRPCOMM_BARRIER;
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:bad entering barrier",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* everyone sends barrier to local daemon */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* tell the daemon we are doing a barrier */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buf,
                                       ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    OBJ_DESTRUCT(&buf);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad barrier sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* now receive the release. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    barrier_recvd = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_BARRIER,
                                 ORTE_RML_NON_PERSISTENT, barrier_recv, NULL);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    ORTE_PROGRESSED_WAIT(barrier_recvd, 0, 1);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:bad received barrier release",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
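/* A minimal sketch (assumed, not verbatim) of the barrier_recv callback:
 * the release message carries no payload, so the callback only has to
 * flip the flag that ORTE_PROGRESSED_WAIT is spinning on.
 */
static void barrier_recv(int status, orte_process_name_t *sender,
                         opal_buffer_t *buffer, orte_rml_tag_t tag,
                         void *cbdata)
{
    /* release the wait in barrier() */
    barrier_recvd = true;
}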
int orte_routed_base_register_sync(bool setup)
{
    opal_buffer_t buffer;
    int rc;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_SYNC_BY_PROC;
    char *rml_uri;

    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync to daemon %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));

    /* we need to get the oob to establish
     * the connection - the oob will leave the connection "alive"
     * thereafter so we can communicate readily
     */
    OBJ_CONSTRUCT(&buffer, opal_buffer_t);

    /* if we are setting up, tell the daemon to send back a nidmap */
    if (setup) {
        command = ORTE_DAEMON_SYNC_WANT_NIDMAP;
    }

    /* tell the daemon to sync */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }

    /* add our contact info to the buffer so the daemon can explicitly
     * store it
     */
    rml_uri = orte_rml.get_contact_info();
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buffer, &rml_uri, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        free(rml_uri);
        return rc;
    }
    if (NULL != rml_uri) {
        free(rml_uri);
    }

    /* send the sync command to our daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &buffer,
                                       ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buffer);
        return rc;
    }
    OBJ_DESTRUCT(&buffer);

    /* get the ack - need this to ensure that the sync communication
     * gets serviced by the event library on the orted prior to the
     * process exiting
     */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync waiting for ack",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    sync_recvd = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC,
                                 ORTE_RML_NON_PERSISTENT, report_sync, NULL);
    if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1);

    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output,
                         "%s registering sync ack recvd",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
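/* A minimal sketch (assumed, not the verbatim base code) of the report_sync
 * callback posted above. When a nidmap was requested (setup == true), the
 * real callback presumably also stashes the returned payload before
 * flagging; here we show only the release of the progressed wait.
 */
static void report_sync(int status, orte_process_name_t *sender,
                        opal_buffer_t *buffer, orte_rml_tag_t tag,
                        void *cbdata)
{
    /* release the wait in orte_routed_base_register_sync() */
    sync_recvd = true;
}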
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
    /* the linear module routes all proc communications through
     * the local daemon. Daemons must identify which of their
     * daemon-peers is "hosting" the specified recipient and
     * route the message to that daemon. Daemon contact info
     * is handled elsewhere, so all we need to do here is
     * ensure that the procs are told to route through their
     * local daemon, and that daemons are told how to route
     * for each proc
     */
    int rc;

    /* if I am a tool, then I stand alone - there is nothing to do */
    if (ORTE_PROC_IS_TOOL) {
        return ORTE_SUCCESS;
    }

    /* if I am a daemon or HNP, then I have to extract the routing info for this job
     * from the data sent to me for launch and update the routing tables to
     * point at the daemon for each proc
     */
    if (ORTE_PROC_IS_DAEMON) {
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for daemon job %s\n\thnp_uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri));

        if (NULL == ndat) {
            /* indicates this is being called during orte_init.
             * Get the HNP's name for possible later use
             */
            if (NULL == orte_process_info.my_hnp_uri) {
                /* fatal error */
                ORTE_ERROR_LOG(ORTE_ERR_FATAL);
                return ORTE_ERR_FATAL;
            }
            /* set the contact info into the hash table */
            if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_hnp_uri))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* extract the hnp name and store it */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                               ORTE_PROC_MY_HNP, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* set our lifeline to the HNP - we will abort if that connection is lost */
            lifeline = ORTE_PROC_MY_HNP;

            /* daemons will send their contact info back to the HNP as
             * part of the message confirming they are ready to go. HNPs
             * load their contact info during orte_init
             */
        } else {
            /* ndat != NULL means we are getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output,
                             "%s routed_linear: completed init routes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_IS_HNP) {
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for HNP job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job)));

        if (NULL == ndat) {
            /* if ndat is NULL, then this is being called during init, so just
             * make myself available to catch any reported contact info
             */
            if (ORTE_SUCCESS != (rc = orte_routed_base_comm_start())) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* the HNP has no lifeline */
            lifeline = NULL;
        } else {
            /* if this is for my own jobid, then I am getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_PROC_MY_NAME->jobid == job) {
                if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                /* if not, then I need to process the callback */
                if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
        return ORTE_SUCCESS;
    }

    {  /* MUST BE A PROC */
        /* if ndat != NULL, then this is being invoked by the proc to
         * init a route to a specified process that is outside of our
         * job family. We want that route to go through our HNP, routed via
         * our local daemon - however, we cannot know for
         * certain that the HNP already knows how to talk to the specified
         * procs. For example, in OMPI's publish/subscribe procedures, the
         * DPM framework looks for an mca param containing the global ompi-server's
         * uri. This info will come here so the proc can setup a route to
         * the server - we need to pass the routing info to our HNP
         */
        if (NULL != ndat) {
            int rc;

            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                 "%s routed_linear: init routes w/non-NULL data",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            /* if this is for a job family of zero, then we know that the enclosed
             * procs are local slaves to our daemon. In that case, we can just ignore this
             * as our daemon - given that it had to spawn the local slave - already
             * knows how to talk to them
             */
            if (0 == ORTE_JOB_FAMILY(job)) {
                return ORTE_SUCCESS;
            }

            if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
                /* if this is for a different job family, then we route via our HNP
                 * to minimize connection counts to entities such as ompi-server, so
                 * start by sending the contact info to the HNP for update
                 */
                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                     "%s routed_linear_init_routes: diff job family - sending update to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));

                if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, ndat,
                                                   ORTE_RML_TAG_RML_INFO_UPDATE, 0))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }

                /* wait right here until the HNP acks the update to ensure that
                 * any subsequent messaging can succeed
                 */
                ack_recvd = false;
                rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK,
                                             ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
                ORTE_PROGRESSED_WAIT(ack_recvd, 0, 1);

                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                                     "%s routed_linear_init_routes: ack recvd",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                /* our get_route function automatically routes all messages for
                 * other job families via the HNP, so nothing more to do here
                 */
            }
            return ORTE_SUCCESS;
        }

        /* if ndat=NULL, then we are being called during orte_init. In this
         * case, we need to setup a few critical pieces of info
         */
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri,
                             (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri));

        if (NULL == orte_process_info.my_daemon_uri) {
            /* in this module, we absolutely MUST have this information - if
             * we didn't get it, then error out
             */
            opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: This is a fatal condition when the linear router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: has been selected - either select the unity router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_FATAL;
        }

        /* we have to set the HNP's name, even though we won't route messages directly
         * to it. This is required to ensure that we -do- send messages to the correct
         * HNP name
         */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                           ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* Set the contact info in the RML - this won't actually establish
         * the connection, but just tells the RML how to reach the daemon
         * if/when we attempt to send to it
         */
        if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(orte_process_info.my_daemon_uri))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* extract the daemon's name so we can update the routing table */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
                                                           ORTE_PROC_MY_DAEMON, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* set our lifeline to the local daemon - we will abort if this connection is lost */
        lifeline = ORTE_PROC_MY_DAEMON;

        /* register ourselves - this sends a message to the daemon (warming up that connection)
         * and sends our contact info to the HNP when all local procs have reported
         *
         * NOTE: it may seem odd that we send our contact info to the HNP - after all,
         * the HNP doesn't really need to know how to talk to us directly if we are
         * using this routing method. However, this is good for two reasons:
         *
         * (1) some debuggers and/or tools may need RML contact
         *     info to set themselves up
         *
         * (2) doing so allows the HNP to "block" in a dynamic launch
         *     until all procs are reported running, thus ensuring that no communication
         *     is attempted until the overall ORTE system knows how to talk to everyone -
         *     otherwise, the system can just hang.
         */
        if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* no answer is expected or coming */

        return ORTE_SUCCESS;
    }
}
/**
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn.
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 */
void orte_debugger_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        return;
    }

    /* fill in the proc table for the application processes */
    if (orte_debug_flag) {
        opal_output(0, "Info: Setting up debugger process table for applications\n");
    }

    MPIR_debug_state = 1;

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    /* allocate MPIR_proctable */
    MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) * MPIR_proctable_size);
    if (MPIR_proctable == NULL) {
        opal_output(0, "Error: Out of memory\n");
        return;
    }

    /* initialize MPIR_proctable */
    for (j = 0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank
         */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }

        MPIR_proctable[i].host_name = strdup(proc->node->name);
        if (0 == strncmp(appctx->app, OPAL_PATH_SEP, 1)) {
            MPIR_proctable[i].executable_name = opal_os_path(false, appctx->app, NULL);
        } else {
            MPIR_proctable[i].executable_name = opal_os_path(false, appctx->cwd, appctx->app, NULL);
        }
        MPIR_proctable[i].pid = proc->pid;
    }

    if (orte_debug_flag) {
        dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job
     */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0
         */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);

        (void) MPIR_Breakpoint();

        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);  /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s",
                        ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
static int allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_COLL_CMD;
    struct timeval ompistart, ompistop;
    opal_buffer_t coll;
    orte_grpcomm_coll_t coll_type = ORTE_GRPCOMM_ALLGATHER;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (orte_timing && ORTE_PROC_MY_NAME->vpid == 0) {
        gettimeofday(&ompistart, NULL);
    }

    /* everyone sends data to their local daemon */
    OBJ_CONSTRUCT(&coll, opal_buffer_t);

    /* tell the daemon to collect the data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* tell the daemon we are doing an allgather */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&coll, &coll_type, 1, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* add our data to it */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&coll, sbuf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    /* send to local daemon */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_DAEMON, &coll, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&coll);
        return rc;
    }
    OBJ_DESTRUCT(&coll);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:basic allgather buffer sent",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer that will recv the results */
    allgather_buf = OBJ_NEW(opal_buffer_t);

    /* now receive the final result. Be sure to do this in
     * a manner that allows us to return without being in a recv!
     */
    allgather_complete = false;
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                 ORTE_RML_NON_PERSISTENT, allgather_recv, NULL);
    if (rc != ORTE_SUCCESS) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    ORTE_PROGRESSED_WAIT(allgather_complete, 0, 1);

    /* copy payload to the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }
    OBJ_RELEASE(allgather_buf);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather buffer received",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (orte_timing) {
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup a receive to hear when the rank=N proc has received the data
             * release - in most xcast schemes, this will always be the final recvr
             */
            barrier_timer = false;
            rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE_TIMER,
                                         ORTE_RML_NON_PERSISTENT, barrier_timer_recv, NULL);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            ORTE_PROGRESSED_WAIT(barrier_timer, 0, 1);
            gettimeofday(&ompistop, NULL);
            opal_output(0, "%s allgather: time to complete %ld usec",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (long int)((ompistop.tv_sec - ompistart.tv_sec) * 1000000 +
                                   (ompistop.tv_usec - ompistart.tv_usec)));
        } else if (ORTE_PROC_MY_NAME->vpid == orte_process_info.num_procs - 1) {
            /* if we are rank=N, send a message back to indicate
             * the xcast completed for timing purposes
             */
            orte_process_name_t name;
            opal_buffer_t buf;

            name.jobid = ORTE_PROC_MY_NAME->jobid;
            name.vpid = 0;
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            if (0 > (rc = orte_rml.send_buffer(&name, &buf, ORTE_RML_TAG_COLLECTIVE_TIMER, 0))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            rc = ORTE_SUCCESS;
            OBJ_DESTRUCT(&buf);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:basic allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}
/* NB: the epoch-enabled prototype below is reconstructed - the original
 * excerpt retained only the stray #endif and the ORTE_ENABLE_EPOCH block
 * in the body that packs the epoch argument */
#if ORTE_ENABLE_EPOCH
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job,
                                   orte_vpid_t vpid, orte_epoch_t epoch,
                                   int *num_procs, orte_proc_t ***proc_info_array)
#else
int orte_util_comm_query_proc_info(const orte_process_name_t *hnp, orte_jobid_t job,
                                   orte_vpid_t vpid,
                                   int *num_procs, orte_proc_t ***proc_info_array)
#endif
{
    int ret;
    int32_t cnt, cnt_procs, n;
    opal_buffer_t *cmd;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_REPORT_PROC_INFO_CMD;
    orte_proc_t **proc_info;

    /* set default response */
    *num_procs = 0;
    *proc_info_array = NULL;

    /* query the HNP for info on the procs in this job */
    cmd = OBJ_NEW(opal_buffer_t);
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#if ORTE_ENABLE_EPOCH
    if (ORTE_SUCCESS != (ret = opal_dss.pack(cmd, &epoch, 1, ORTE_EPOCH))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }
#endif

    /* define a max time to wait for send to complete */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

    /* do the send */
    if (0 > (ret = orte_rml.send_buffer_nb((orte_process_name_t*)hnp, cmd, ORTE_RML_TAG_DAEMON,
                                           0, send_cbfunc, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(cmd);
        return ret;
    }

    /* wait for send to complete */
    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    /* release the buffer */
    OBJ_RELEASE(cmd);

    /* did it succeed? */
    if (ORTE_SUCCESS != error_exit) {
        return error_exit;
    }

    /* define a max time to wait for an answer */
    timer_fired = false;
    error_exit = ORTE_SUCCESS;
    ORTE_DETECT_TIMEOUT(&quicktime, 10, 1000, 10000, quicktime_cb);

    /* get the answer */
    OBJ_CONSTRUCT(&answer, opal_buffer_t);
    if (ORTE_SUCCESS != (ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                       ORTE_RML_TAG_TOOL,
                                                       ORTE_RML_NON_PERSISTENT,
                                                       recv_info,
                                                       NULL))) {
        /* cancel the timer */
        if (NULL != quicktime) {
            opal_event_evtimer_del(quicktime);
            free(quicktime);
            quicktime = NULL;
        }
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

    if (ORTE_SUCCESS != error_exit) {
        OBJ_DESTRUCT(&answer);
        return error_exit;
    }

    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &cnt_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&answer);
        return ret;
    }

    /* allocate the required memory */
    if (0 < cnt_procs) {
        proc_info = (orte_proc_t**)malloc(cnt_procs * sizeof(orte_proc_t*));
        /* unpack the procs */
        for (n = 0; n < cnt_procs; n++) {
            cnt = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(&answer, &proc_info[n], &cnt, ORTE_PROC))) {
                ORTE_ERROR_LOG(ret);
                OBJ_DESTRUCT(&answer);
                free(proc_info);
                return ret;
            }
        }
        *proc_info_array = proc_info;
        *num_procs = (int)cnt_procs;
    }
    OBJ_DESTRUCT(&answer);
    return ORTE_SUCCESS;
}
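/* A minimal sketch (assumed, not the verbatim helper) of the quicktime_cb
 * timeout handler armed by ORTE_DETECT_TIMEOUT above: if the send or the
 * answer takes too long, it flags an error (ORTE_ERR_TIMEOUT is our
 * assumption for the code used) and trips timer_fired so the progressed
 * wait cannot spin forever. On success, the send/recv callbacks would
 * cancel this timer and set timer_fired themselves.
 */
static void quicktime_cb(int fd, short event, void *cbdata)
{
    /* the event has fired, so its storage can be reclaimed */
    if (NULL != quicktime) {
        free(quicktime);
        quicktime = NULL;
    }
    /* declare the operation timed out */
    error_exit = ORTE_ERR_TIMEOUT;
    /* release the progressed wait */
    timer_fired = true;
}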
/* report an event to a connected tool */
int orte_util_comm_report_event(orte_comm_event_t ev)
{
    int rc, i;
    opal_buffer_t buf;
    orte_node_t *node;

    /* if nothing is connected, ignore this */
    if (!tool_connected) {
        return ORTE_SUCCESS;
    }

    /* init a buffer for the data */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    /* flag the type of event */
    opal_dss.pack(&buf, &ev, 1, ORTE_COMM_EVENT);

    switch (ev) {
        case ORTE_COMM_EVENT_ALLOCATE:
            /* loop through nodes, storing just node names */
            for (i = 0; i < orte_node_pool->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                opal_dss.pack(&buf, &node->name, 1, OPAL_STRING);
            }
            break;

        case ORTE_COMM_EVENT_MAP:
            break;

        case ORTE_COMM_EVENT_LAUNCH:
            break;

        default:
            ORTE_ERROR_LOG(ORTE_ERROR);
            OBJ_DESTRUCT(&buf);
            return ORTE_ERROR;
    }

    /* do the send */
    if (0 > (rc = orte_rml.send_buffer(&tool, &buf, ORTE_RML_TAG_TOOL, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }

    if (step) {
        /* the caller wants to wait until an ack is received -
         * define a max time to wait for an answer
         */
        OBJ_CONSTRUCT(&answer, opal_buffer_t);
        timer_fired = false;
        error_exit = ORTE_SUCCESS;
        ORTE_DETECT_TIMEOUT(&quicktime, 100, 1000, 100000, quicktime_cb);

        /* get the answer */
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_TOOL,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          recv_info,
                                                          NULL))) {
            /* cancel the timer */
            if (NULL != quicktime) {
                opal_event_evtimer_del(quicktime);
                free(quicktime);
                quicktime = NULL;
            }
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&answer);
            return rc;
        }

        ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);

        /* cleanup */
        OBJ_DESTRUCT(&answer);

        if (ORTE_SUCCESS != error_exit) {
            return error_exit;
        }
    }
    return ORTE_SUCCESS;
}
/* file-scope state shared with the recv_ack callback; MY_TAG is the RML
 * tag used by this test (defined elsewhere in the original source) */
static opal_buffer_t buf;
static volatile bool msg_recvd;

int main(int argc, char *argv[])
{
    int count;
    int msgsize;
    uint8_t *msg;
    int j, rc;
    orte_process_name_t peer;
    double maxpower;

    /*
     * Init
     */
    orte_init(&argc, &argv, ORTE_PROC_NON_MPI);

    if (argc > 1) {
        count = atoi(argv[1]);
        if (count < 0) {
            count = INT_MAX - 1;
        }
    } else {
        count = MAX_COUNT;
    }

    peer.jobid = ORTE_PROC_MY_NAME->jobid;

    for (j = 1; j < count + 1; j++) {
        peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        /* rank0 starts ring */
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup the initiating buffer - put random sized message in it */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            maxpower = (double)(j % 7);
            msgsize = (int)pow(10.0, maxpower);
            opal_output(0, "Ring %d message size %d bytes", j, msgsize);
            msg = (uint8_t*)malloc(msgsize);
            opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE);
            free(msg);  /* pack copies the bytes, so the staging buffer can go */

            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                /* note: the original format string dropped the leading %s */
                opal_output(0, "%s error sending to %s %s\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);

            /* wait for it to come around */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            opal_output(0, "%s Ring %d completed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
        } else {
            /* wait for msg */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);

            /* send it along */
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
        }
    }

    orte_finalize();
    return 0;
}
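/* A minimal sketch (assumed, not the verbatim test code) of the recv_ack
 * callback used above: it transfers the ring message into the file-scope
 * buffer so the main loop can forward it, then releases the wait. In the
 * real source this would be defined (or forward-declared) before main().
 */
static void recv_ack(int status, orte_process_name_t *sender,
                     opal_buffer_t *buffer, orte_rml_tag_t tag,
                     void *cbdata)
{
    /* capture the message so the main loop can pass it along */
    opal_dss.copy_payload(&buf, buffer);
    /* release the ORTE_PROGRESSED_WAIT in main() */
    msg_recvd = true;
}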
/* For a complete description of this algorithm, please look at
 * ompi/mca/coll/tuned/coll_tuned_allgather.c
 */
static int recursivedoubling(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
                             int32_t num_entries, orte_jobid_t jobid,
                             orte_vpid_t np, orte_vpid_t *vpids)
{
    orte_vpid_t rank, distance, nv;
    int32_t num_remote, total_entries, cnt;
    opal_buffer_t collection, buf;
    orte_process_name_t peer;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:recdub algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* initialize */
    total_entries = num_entries;

    /* start by seeding the collection with our own data */
    OBJ_CONSTRUCT(&collection, opal_buffer_t);
    opal_dss.copy_payload(&collection, sendbuf);

    /* collective is constrained to take place within the specified jobid */
    peer.jobid = jobid;

    /* Communication step:
       At every step i, rank r:
       - exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
     */
    /* find my position in the group of participants. This
     * value is the "rank" we will use in the algo
     */
    rank = ORTE_VPID_INVALID;
    for (nv = 0; nv < np; nv++) {
        if (vpids[nv] == ORTE_PROC_MY_NAME->vpid) {
            rank = nv;
            break;
        }
    }

    /* check for bozo case */
    if (ORTE_VPID_INVALID == rank) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    for (distance = 0x1; distance < np; distance <<= 1) {
        /* first send my current contents */
        nv = rank ^ distance;
        peer.vpid = vpids[nv];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &total_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, &collection);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:recdub sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));

        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);

        /* now setup to recv from my other partner */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(&peer,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* and wait for it to get here */
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        /* extract the number of entries in the remote buffer */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* add it to our running total */
        total_entries += num_remote;

        /* transfer the data to our collection */
        opal_dss.copy_payload(&collection, &bucket);

        /* cleanup */
        OBJ_DESTRUCT(&bucket);
    }

    /* output of a collective begins with the total number of entries */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &total_entries, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* transfer the collected data */
    opal_dss.copy_payload(recvbuf, &collection);

    /* cleanup */
    OBJ_DESTRUCT(&collection);

    return ORTE_SUCCESS;
}
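/* A minimal sketch (assumed, not verbatim) of orte_grpcomm_base_coll_recv,
 * the callback both recursivedoubling() and twoproc() rely on: it drains
 * the incoming message into the file-scope "bucket" buffer and bumps
 * num_recvd so ORTE_PROGRESSED_WAIT(false, num_recvd, 1) can complete.
 */
void orte_grpcomm_base_coll_recv(int status, orte_process_name_t *sender,
                                 opal_buffer_t *buffer, orte_rml_tag_t tag,
                                 void *cbdata)
{
    int rc;

    /* transfer the message into the collection bucket */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&bucket, buffer))) {
        ORTE_ERROR_LOG(rc);
    }
    /* signal arrival to the waiting collective */
    num_recvd++;
}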
/*
 * The Two-Proc Algorithm
 *
 * One sends to zero, zero waits to recv from one
 * Zero adds its data to message, sends result back to one
 */
static int twoproc(opal_buffer_t *sendbuf, opal_buffer_t *recvbuf,
                   int32_t num_entries, orte_jobid_t jobid,
                   orte_vpid_t *vpids)
{
    orte_process_name_t peer;
    int32_t num_remote, cnt;
    int rc;
    opal_buffer_t buf;

    peer.jobid = jobid;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                         "%s grpcomm:coll:two-proc algo employed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (vpids[0] == ORTE_PROC_MY_NAME->vpid) {
        /* I send first */
        peer.vpid = vpids[1];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        /* setup a temp buffer so I can inform the other side as to the
         * number of entries in my buffer
         */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));

        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);

        /* wait for reply */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my return message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    } else {
        /* if I am not the start, then I recv first */
        num_recvd = 0;
        OBJ_CONSTRUCT(&bucket, opal_buffer_t);
        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                          ORTE_RML_TAG_DAEMON_COLLECTIVE,
                                                          ORTE_RML_NON_PERSISTENT,
                                                          orte_grpcomm_base_coll_recv,
                                                          NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        ORTE_PROGRESSED_WAIT(false, num_recvd, 1);

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc got my starting message",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* send my data back */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &num_entries, 1, OPAL_INT32);
        opal_dss.copy_payload(&buf, sendbuf);
        peer.vpid = vpids[0];
        ORTE_EPOCH_SET(peer.epoch, orte_ess.proc_get_epoch(&peer));

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output,
                             "%s grpcomm:coll:two-proc sending to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer)));

        if (0 > (rc = orte_rml.send_buffer(&peer, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }

    /* extract the number of entries in the remote buffer */
    cnt = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&bucket, &num_remote, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* output of a collective begins with the total number of entries */
    num_remote += num_entries;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(recvbuf, &num_remote, 1, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* xfer my data */
    opal_dss.copy_payload(recvbuf, sendbuf);
    /* xfer the recvd data */
    opal_dss.copy_payload(recvbuf, &bucket);

    /* cleanup */
    OBJ_DESTRUCT(&bucket);

    return ORTE_SUCCESS;
}
static int daemon_collective(orte_process_name_t *sender, opal_buffer_t *data)
{
    orte_jobid_t jobid;
    orte_odls_job_t *jobdat, *jdat;
    orte_routed_tree_t *child;
    orte_std_cntr_t n;
    opal_list_t daemon_tree;
    opal_list_item_t *item, *next;
    int32_t num_contributors;
    opal_buffer_t buf;
    orte_process_name_t my_parent, proc;
    orte_vpid_t daemonvpid;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s odls: daemon collective called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the jobid using this collective */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* lookup the job record for it - note that jobdat must stay NULL when
     * no record matches so the guard below can create one (the original
     * left jobdat pointing at the last list item in that case) */
    jobdat = NULL;
    for (item = opal_list_get_first(&orte_local_jobdata);
         item != opal_list_get_end(&orte_local_jobdata);
         item = opal_list_get_next(item)) {
        jdat = (orte_odls_job_t*)item;
        /* is this the specified job? */
        if (jdat->jobid == jobid) {
            jobdat = jdat;
            break;
        }
    }
    if (NULL == jobdat) {
        /* race condition - someone sent us a collective before we could
         * parse the add_local_procs cmd. Just add the jobdat object
         * and continue
         */
        jobdat = OBJ_NEW(orte_odls_job_t);
        jobdat->jobid = jobid;
        opal_list_append(&orte_local_jobdata, &jobdat->super);
    }

    /* it may be possible to get here prior to having actually finished processing our
     * local launch msg due to the race condition between different nodes and when
     * they start their individual procs. Hence, we have to first ensure that we
     * -have- finished processing the launch msg, or else we won't know whether
     * or not to wait before sending this on
     */
    ORTE_PROGRESSED_WAIT(jobdat->launch_msg_processed, 0, 1);

    /* unpack the collective type */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobdat->collective_type, &n, ORTE_GRPCOMM_COLL_T))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* unpack the number of contributors in this data bucket */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    jobdat->num_contributors += num_contributors;

    /* xfer the data */
    opal_dss.copy_payload(&jobdat->collection_bucket, data);

    /* count the number of participants collected */
    jobdat->num_collected++;

    /* if we haven't already done so, figure out how many participants we
     * should be expecting
     */
    if (jobdat->num_participating < 0) {
        if (0 < jobdat->num_local_procs) {
            /* we have children, so account for our own participation */
            jobdat->num_participating = 1;
        } else {
            jobdat->num_participating = 0;
        }
        /* now see if anyone else will be sending us something */
        OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
        orte_routed.get_routing_tree(&daemon_tree);
        /* unfortunately, there is no simple way to determine which of our "child"
         * daemons in the routing tree will be sending us something. All we can do
         * is brute force a search, though we attempt to keep it as short as possible
         */
        proc.jobid = jobid;
        proc.vpid = 0;
        while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) {
            /* get the daemon that hosts this proc */
            daemonvpid = orte_ess.proc_get_daemon(&proc);
            /* is this daemon one of our children, or at least its contribution
             * will pass through one of our children
             */
            item = opal_list_get_first(&daemon_tree);
            while (item != opal_list_get_end(&daemon_tree)) {
                next = opal_list_get_next(item);
                child = (orte_routed_tree_t*)item;
                if (child->vpid == daemonvpid ||
                    opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) {
                    /* it does - add to num_participating */
                    jobdat->num_participating++;
                    /* remove this from the list so we don't double count it */
                    opal_list_remove_item(&daemon_tree, item);
                    /* done with search */
                    break;
                }
                item = next;
            }
            proc.vpid++;
        }
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: daemon collective for job %s from %s type %ld"
                         " num_collected %d num_participating %d num_contributors %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid),
                         ORTE_NAME_PRINT(sender),
                         (long)jobdat->collective_type, jobdat->num_collected,
                         jobdat->num_participating, jobdat->num_contributors));

    if (jobdat->num_collected == jobdat->num_participating) {
        /* if I am the HNP, go process the results */
        if (ORTE_PROC_IS_HNP) {
            goto hnp_process;
        }

        /* if I am not the HNP, send to my parent */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        /* pack the jobid */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* pack the collective type */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->collective_type, 1, ORTE_GRPCOMM_COLL_T))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* pack the number of contributors */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* xfer the payload */
        opal_dss.copy_payload(&buf, &jobdat->collection_bucket);
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send it */
        my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
        my_parent.vpid = orte_routed.get_routing_tree(NULL);
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                             "%s grpcomm:bad: daemon collective not the HNP - sending to parent %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&my_parent)));
        if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
    }
    return ORTE_SUCCESS;

hnp_process:
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: daemon collective HNP - xcasting to job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobid)));

    /* setup a buffer to send the results back to the job members */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);

    if (ORTE_GRPCOMM_BARRIER == jobdat->collective_type) {
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* don't need anything in this for a barrier */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, ORTE_RML_TAG_BARRIER))) {
            ORTE_ERROR_LOG(rc);
        }
    } else if (ORTE_GRPCOMM_ALLGATHER == jobdat->collective_type) {
        int32_t numc;
        /* add the data */
        numc = jobdat->num_contributors;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* reset everything for next collective */
        jobdat->num_contributors = 0;
        jobdat->num_collected = 0;
        OBJ_DESTRUCT(&jobdat->collection_bucket);
        OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t);
        /* send the buffer */
        if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, ORTE_RML_TAG_ALLGATHER))) {
            ORTE_ERROR_LOG(rc);
        }
    } else {
        /* no other collectives currently supported! */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        rc = ORTE_ERR_NOT_IMPLEMENTED;
    }

cleanup:
    OBJ_DESTRUCT(&buf);
    return ORTE_SUCCESS;
}
static int onesided_barrier(void)
{
    int num_participating;
    opal_list_t daemon_tree;
    opal_buffer_t buf;
    orte_process_name_t my_parent;
    opal_event_t *quicktime = NULL;
    struct timeval quicktimeval;
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: onesided barrier called",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are not to use the barrier, then just return */
    if (!orte_orted_exit_with_barrier) {
        if (ORTE_PROC_IS_HNP) {
            /* if we are the HNP, we need to do a little delay to give
             * the orteds a chance to exit before we leave
             */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:bad: onesided barrier adding delay timer",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            quicktimeval.tv_sec = 0;
            quicktimeval.tv_usec = 100;
            timer_fired = false;
            ORTE_DETECT_TIMEOUT(&quicktime, orte_process_info.num_procs, 1000, 10000, quicktime_cb);
            ORTE_PROGRESSED_WAIT(timer_fired, 0, 1);
        }
        return ORTE_SUCCESS;
    }

    /* initialize things */
    num_onesided_barrier_recvd = 0;
    num_participating = 0;

    /* figure out how many participants we should be expecting */
    OBJ_CONSTRUCT(&daemon_tree, opal_list_t);
    my_parent.jobid = ORTE_PROC_MY_NAME->jobid;
    my_parent.vpid = orte_routed.get_routing_tree(&daemon_tree);
    num_participating = opal_list_get_size(&daemon_tree);

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad: onesided barrier num_participating %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_participating));

    /* set the recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_ONESIDED_BARRIER,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      onesided_barrier_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
    /* wait to recv them */
    ORTE_PROGRESSED_WAIT(false, num_onesided_barrier_recvd, num_participating);
    /* cancel the recv */
    orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ONESIDED_BARRIER);

    /* if I am the HNP, then we are done */
    if (ORTE_PROC_IS_HNP) {
        return ORTE_SUCCESS;
    }

    /* send a zero-byte msg to my parent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* send it */
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:bad:onesided:barrier not the HNP - sending to parent %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&my_parent)));
    if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_ONESIDED_BARRIER, 0))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    OBJ_DESTRUCT(&buf);
    return ORTE_SUCCESS;
}
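/* A minimal sketch (assumed, not verbatim) of onesided_barrier_recv: each
 * zero-byte message from a child daemon bumps the counter the barrier is
 * progressing on. Because the recv above is posted non-persistent yet
 * multiple children may report, the callback presumably re-posts itself;
 * the leftover post is then cancelled by onesided_barrier().
 */
static void onesided_barrier_recv(int status, orte_process_name_t *sender,
                                  opal_buffer_t *buffer, orte_rml_tag_t tag,
                                  void *cbdata)
{
    int rc;

    /* count the child daemon as having reported */
    num_onesided_barrier_recvd++;

    /* re-post the non-persistent recv for the next child */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                                      ORTE_RML_TAG_ONESIDED_BARRIER,
                                                      ORTE_RML_NON_PERSISTENT,
                                                      onesided_barrier_recv,
                                                      NULL))) {
        ORTE_ERROR_LOG(rc);
    }
}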
int orte_grpcomm_base_allgather_list(opal_list_t *names,
                                     opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    opal_list_item_t *item;
    orte_namelist_t *peer, *root;
    orte_std_cntr_t num_peers;
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm: entering allgather_list",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* the first entry on the list is the "root" that collects
     * all the data - everyone else just sends and gets back
     * the results
     */
    root = (orte_namelist_t*)opal_list_get_first(names);

    /*** NON-ROOT ***/
    if (OPAL_EQUAL != opal_dss.compare(&root->name, ORTE_PROC_MY_NAME, ORTE_NAME)) {
        /* everyone but root sends data */
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: sending my data to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&root->name)));

        if (0 > orte_rml.send_buffer(&root->name, sbuf, ORTE_RML_TAG_ALLGATHER_LIST, 0)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            return ORTE_ERR_COMM_FAILURE;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: buffer sent",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* setup the buffer that will recv the results */
        allgather_buf = OBJ_NEW(opal_buffer_t);

        /* now receive the final result from rank=0. Be sure to do this in
         * a manner that allows us to return without being in a recv!
         */
        allgather_num_recvd = 0;
        allgather_failed = false;
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST,
                                     ORTE_RML_NON_PERSISTENT, allgather_client_recv, NULL);
        if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, 1);

        /* if the allgather failed, return an error */
        if (allgather_failed) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            OBJ_RELEASE(allgather_buf);
            return ORTE_ERR_COMM_FAILURE;
        }

        /* copy payload to the caller's buffer */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(allgather_buf);
            return rc;
        }
        OBJ_RELEASE(allgather_buf);

        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                             "%s allgather_list: buffer received",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }

    /*** ROOT ***/
    /* count how many peers are participating, including myself */
    num_peers = (orte_std_cntr_t)opal_list_get_size(names);

    /* seed the outgoing buffer with the num_procs so it can be unpacked */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(rbuf, &num_peers, 1, ORTE_STD_CNTR))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* put my own information into the outgoing buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, sbuf))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* setup the recv conditions */
    allgather_failed = false;
    allgather_num_recvd = 0;

    /* setup the buffer that will recv the results */
    allgather_buf = OBJ_NEW(opal_buffer_t);

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather_list: waiting to recv %ld inputs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)num_peers - 1));

    /* post the non-blocking recv */
    rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST,
                                 ORTE_RML_NON_PERSISTENT, allgather_server_recv, NULL);
    if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    ORTE_PROGRESSED_WAIT(allgather_failed, allgather_num_recvd, num_peers - 1);

    /* cancel the lingering recv */
    if (ORTE_SUCCESS != (rc = orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER_LIST))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s allgather_list: received all data",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* copy the received info to the caller's buffer */
    if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, allgather_buf))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(allgather_buf);
        return rc;
    }
    OBJ_RELEASE(allgather_buf);

    /* broadcast the results */
    allgather_num_sent = 0;
    for (item = opal_list_get_first(names);
         item != opal_list_get_end(names);
         item = opal_list_get_next(item)) {
        peer = (orte_namelist_t*)item;

        /* skip myself */
        if (OPAL_EQUAL == opal_dss.compare(&root->name, &peer->name, ORTE_NAME)) {
            continue;
        }

        /* transmit the buffer to this process */
        if (0 > orte_rml.send_buffer_nb(&peer->name, rbuf, ORTE_RML_TAG_ALLGATHER_LIST,
                                        0, allgather_send_cb, 0)) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            return ORTE_ERR_COMM_FAILURE;
        }
    }
    ORTE_PROGRESSED_WAIT(false, allgather_num_sent, num_peers - 1);

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm: allgather_list completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return ORTE_SUCCESS;
}