/*
 * Create a jobid
 */
int orte_plm_base_create_jobid(orte_job_t *jdata)
{
#if 0
    int32_t j;

    /* RHC: WHILE ORTE CAN NOW HANDLE RECYCLING OF JOBID'S,
     * THE MPI LAYER CANNOT SINCE THERE IS NO WAY TO
     * UPDATE THE OMPI_PROC_T LIST AND/OR THE BTL'S
     */

    /* see if there is a prior jobid that has completed and can be
     * re-used. It can never be 0 as that belongs to the HNP and its
     * daemons */
    for (j=1; j < orte_job_data->size; j++) {
        if (NULL == opal_pointer_array_get_item(orte_job_data, j)) {
            /* this local jobid is available - reuse it */
            jdata->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, j);
            return ORTE_SUCCESS;
        }
    }
#endif

    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
        /* this job is being restarted - do not assign it
         * a new jobid */
        return ORTE_SUCCESS;
    }

    if (UINT16_MAX == orte_plm_globals.next_jobid) {
        /* if we get here, then no local jobids are available */
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        jdata->jobid = ORTE_JOBID_INVALID;
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /* take the next jobid */
    jdata->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid,
                                              orte_plm_globals.next_jobid);
    orte_plm_globals.next_jobid++;

    return ORTE_SUCCESS;
}
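/*
 * Illustrative note (not part of the original source): an ORTE jobid packs a
 * 16-bit "job family" - inherited here from ORTE_PROC_MY_NAME->jobid - into
 * the upper half of a 32-bit value, and a 16-bit local jobid into the lower
 * half, which is why next_jobid is exhausted at UINT16_MAX. Conceptually:
 *
 *     jobid = (family_jobid & 0xffff0000) | (local & 0x0000ffff);
 *
 * ORTE_JOB_FAMILY() and ORTE_LOCAL_JOBID() recover the two halves by masking.
 */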
/* update data function */
static void update_data(int fd, short flg, void *arg)
{
    opal_buffer_t *buf;
    int32_t ret;
    orte_process_name_t name;

    /* setup the buffer to send our cmd */
    buf = OBJ_NEW(opal_buffer_t);

    name.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(my_globals.sched, 0);
    name.vpid = 0;
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(ret);
        /* release the buffer we just created so it doesn't leak */
        OBJ_RELEASE(buf);
        return;
    }

    if (ORCM_SUCCESS != (ret = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL,
                                                  ORCM_PNP_TAG_PS, NULL, 0,
                                                  buf, cbfunc, NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_RELEASE(buf);
    }
}
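/*
 * Receiver-side sketch (illustrative, not from the original source): the
 * peer servicing ORCM_PNP_TAG_PS would unpack the name packed above before
 * acting on it, using the matching dss call:
 *
 *     orte_process_name_t name;
 *     int32_t n = 1;
 *     if (ORTE_SUCCESS != (ret = opal_dss.unpack(buf, &name, &n, ORTE_NAME))) {
 *         ORTE_ERROR_LOG(ret);
 *     }
 */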
static int rte_init(void)
{
    int rc, ret;
    char *error = NULL;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *string_key;
    opal_value_t *kv;
    char *val;
    int u32, *u32ptr;
    uint16_t u16, *u16ptr;
    orte_process_name_t name;

    /* run the prolog */
    if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    u32ptr = &u32;
    u16ptr = &u16;

    if (NULL != mca_ess_singleton_component.server_uri) {
        /* we are going to connect to a server HNP */
        if (0 == strncmp(mca_ess_singleton_component.server_uri, "file", strlen("file")) ||
            0 == strncmp(mca_ess_singleton_component.server_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;

            /* it is a file - get the filename */
            filename = strchr(mca_ess_singleton_component.server_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }
            ++filename; /* space past the : */

            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }

            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }
            memset(input, 0, 1024);  // initialize the array to ensure a NULL termination
            if (NULL == fgets(input, 1023, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad",
                               true, "singleton", mca_ess_singleton_component.server_uri,
                               "singleton");
                return ORTE_ERROR;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            orte_process_info.my_hnp_uri = strdup(input);
        } else {
            orte_process_info.my_hnp_uri = strdup(mca_ess_singleton_component.server_uri);
        }
        /* save the daemon uri - we will process it later */
        orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
        /* construct our name - we are in their job family, so we know that
         * much. However, we cannot know how many other singletons and jobs
         * this HNP is running. Oh well - if someone really wants to use this
         * option, they can try to figure it out. For now, we'll just assume
         * we are the only ones */
        ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_HNP->jobid, 1);
        /* obviously, we are vpid=0 for this job */
        ORTE_PROC_MY_NAME->vpid = 0;

        /* for convenience, push the pubsub version of this param into the environ */
        opal_setenv(OPAL_MCA_PREFIX"pubsub_orte_server",
                    orte_process_info.my_hnp_uri, true, &environ);
    } else if (NULL != getenv("SINGULARITY_CONTAINER") ||
               mca_ess_singleton_component.isolated) {
        /* ensure we use the isolated pmix component */
        opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
    } else {
        /* spawn our very own HNP to support us */
        if (ORTE_SUCCESS != (rc = fork_hnp())) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* our name was given to us by the HNP */
        opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
    }

    /* get an async event base - we use the opal_async one so
     * we don't startup extra threads if not needed */
    orte_event_base = opal_progress_thread_init(NULL);
    progress_thread_running = true;

    /* open and setup pmix */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        error = "opening pmix";
        goto error;
    }
    if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
        error = "select pmix";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* initialize the selected module */
    if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
        /* we cannot run */
        error = "pmix init";
        goto error;
    }

    /* pmix.init set our process name down in the OPAL layer,
     * so carry it forward here */
    ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
    ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
    name.jobid = OPAL_PROC_MY_NAME.jobid;
    name.vpid = ORTE_VPID_WILDCARD;

    /* get our local rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting local rank";
        goto error;
    }
    orte_process_info.my_local_rank = u16;

    /* get our node rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting node rank";
        goto error;
    }
    orte_process_info.my_node_rank = u16;

    /* get max procs */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
                          &name, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting max procs";
        goto error;
    }
    orte_process_info.max_procs = u32;

    /* we are a singleton, so there is only one proc in the job */
    orte_process_info.num_procs = 1;

    /* push into the environ for pickup in MPI layer for
     * MPI-3 required info key */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
        asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d",
                 orte_process_info.num_procs);
        putenv(ev1);
        added_num_procs = true;
    }
    if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d",
                 orte_process_info.num_procs);
        putenv(ev2);
        added_app_ctx = true;
    }

    /* get our app number from PMI - ok if not found */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_APPNUM,
                          ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.app_num = u32;
    } else {
        orte_process_info.app_num = 0;
    }
    /* set some other standard values */
    orte_process_info.num_local_peers = 0;

    /* setup transport keys in case the MPI layer needs them -
     * we can use the jobfam and stepid as unique keys
     * because they are unique values assigned by the RM */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
        unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
        unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        putenv(envar);
        added_transport_keys = true;
        /* cannot free the envar as that messes up our environ */
        free(string_key);
    }

    /* retrieve our topology */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
                          &name, &val, OPAL_STRING);
    if (OPAL_SUCCESS == ret && NULL != val) {
        /* load the topology */
        if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            free(val);
            error = "setting topology";
            goto error;
        }
        if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
            ret = OPAL_ERROR;
            free(val);
            hwloc_topology_destroy(opal_hwloc_topology);
            error = "setting topology";
            goto error;
        }
        /* since we are loading this from an external source, we have to
         * explicitly set a flag so hwloc sets things up correctly */
        if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
                                          (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        /* now load the topology */
        if (0 != hwloc_topology_load(opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        free(val);
    } else {
        /* it wasn't passed down to us, so go get it */
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
        /* push it into the PMIx database in case someone
         * tries to retrieve it so we avoid an attempt to
         * get it again */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
        kv->type = OPAL_STRING;
        if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology,
                                                        &kv->data.string, &u32))) {
            error = "topology export";
            goto error;
        }
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) {
            error = "topology store";
            goto error;
        }
        OBJ_RELEASE(kv);
    }

    /* use the std app init to complete the procedure */
    if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup(true))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* push our hostname so others can find us, if they need to */
    OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME,
                          orte_process_info.nodename, OPAL_STRING);
    if (ORTE_SUCCESS != ret) {
        error = "db store hostname";
        goto error;
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ret;
}
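/*
 * Minimal sketch (illustrative, not part of the original source) of the
 * topology-import path used above: when an hwloc topology arrives as an XML
 * buffer from another process, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM must be set
 * before loading, or hwloc treats the result as a foreign system and binding
 * support is disabled.
 */
static int load_topo_from_xml(const char *xml, hwloc_topology_t *topo)
{
    if (0 != hwloc_topology_init(topo)) {
        return OPAL_ERROR;
    }
    /* import the XML, mark it as describing this system, and load it */
    if (0 != hwloc_topology_set_xmlbuffer(*topo, xml, strlen(xml)) ||
        0 != hwloc_topology_set_flags(*topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM) ||
        0 != hwloc_topology_load(*topo)) {
        hwloc_topology_destroy(*topo);
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}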
void orte_daemon_recv(int status, orte_process_name_t* sender,
                      opal_buffer_t *buffer, orte_rml_tag_t tag,
                      void* cbdata)
{
    orte_daemon_cmd_flag_t command;
    opal_buffer_t *relay_msg;
    int ret;
    orte_std_cntr_t n;
    int32_t signal;
    orte_jobid_t job;
    orte_rml_tag_t target_tag;
    char *contact_info;
    opal_buffer_t *answer;
    orte_rml_cmd_flag_t rml_cmd;
    orte_job_t *jdata;
    orte_process_name_t proc, proc2;
    orte_process_name_t *return_addr;
    int32_t i, num_replies;
    bool hnp_accounted_for;
    opal_pointer_array_t procarray;
    orte_proc_t *proct;
    char *cmd_str = NULL;
    opal_pointer_array_t *procs_to_kill = NULL;
    orte_std_cntr_t num_procs, num_new_procs = 0, p;
    orte_proc_t *cur_proc = NULL, *prev_proc = NULL;
    bool found = false;
    orte_node_t *node;
    orte_grpcomm_signature_t *sig;

    /* unpack the command */
    n = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &n, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(ret);
        return;
    }

    cmd_str = get_orted_comm_cmd_str(command);
    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s orted:comm:process_commands() Processing Command: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd_str));
    free(cmd_str);
    cmd_str = NULL;

    /* now process the command locally */
    switch(command) {

        /**** NULL ****/
    case ORTE_DAEMON_NULL_CMD:
        ret = ORTE_SUCCESS;
        break;

        /**** KILL_LOCAL_PROCS ****/
    case ORTE_DAEMON_KILL_LOCAL_PROCS:
        num_replies = 0;

        /* construct the pointer array */
        OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
        opal_pointer_array_init(&procarray, num_replies, ORTE_GLOBAL_ARRAY_MAX_SIZE, 16);

        /* unpack the proc names into the array */
        while (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) {
            proct = OBJ_NEW(orte_proc_t);
            proct->name.jobid = proc.jobid;
            proct->name.vpid = proc.vpid;
            opal_pointer_array_add(&procarray, proct);
            num_replies++;
        }
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
            ORTE_ERROR_LOG(ret);
            goto KILL_PROC_CLEANUP;
        }

        if (0 == num_replies) {
            /* kill everything */
            if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(NULL))) {
                ORTE_ERROR_LOG(ret);
            }
            break;
        } else {
            /* kill the procs */
            if (ORTE_SUCCESS != (ret = orte_odls.kill_local_procs(&procarray))) {
                ORTE_ERROR_LOG(ret);
            }
        }

        /* cleanup */
    KILL_PROC_CLEANUP:
        for (i=0; i < procarray.size; i++) {
            if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(&procarray, i))) {
                /* these are OBJ_NEW'd objects, so they must be
                 * released, not free'd */
                OBJ_RELEASE(proct);
            }
        }
        OBJ_DESTRUCT(&procarray);
        break;

        /**** SIGNAL_LOCAL_PROCS ****/
    case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
        /* unpack the jobid */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
        /* look up job data object */
        jdata = orte_get_job_data_object(job);

        /* get the signal */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &signal, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }

        /* Convert SIGTSTP to SIGSTOP so we can suspend a.out */
        if (SIGTSTP == signal) {
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: converted SIGTSTP to SIGSTOP before delivering",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            }
            signal = SIGSTOP;
            if (NULL != jdata) {
                jdata->state |= ORTE_JOB_STATE_SUSPENDED;
            }
        } else if (SIGCONT == signal && NULL != jdata) {
            jdata->state &= ~ORTE_JOB_STATE_SUSPENDED;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received signal_local_procs, delivering signal %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signal);
        }

        /* signal them */
        if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) {
            ORTE_ERROR_LOG(ret);
        }
        break;

        /**** ADD_LOCAL_PROCS ****/
    case ORTE_DAEMON_ADD_LOCAL_PROCS:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received add_local_procs",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }

        /* launch the processes */
        if (ORTE_SUCCESS != (ret = orte_odls.launch_local_procs(buffer))) {
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s orted:comm:add_procs failed to launch on error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret)));
        }
        break;

    case ORTE_DAEMON_ABORT_PROCS_CALLED:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received abort_procs report",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }

        /* Number of processes */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_procs, &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }

        /* Retrieve list of processes */
        procs_to_kill = OBJ_NEW(opal_pointer_array_t);
        opal_pointer_array_init(procs_to_kill, num_procs, INT32_MAX, 2);

        /* Keep track of previously terminated, so we don't keep ordering the
         * same processes to die.
         */
        if (NULL == procs_prev_ordered_to_terminate) {
            procs_prev_ordered_to_terminate = OBJ_NEW(opal_pointer_array_t);
            opal_pointer_array_init(procs_prev_ordered_to_terminate, num_procs+1, INT32_MAX, 8);
        }

        num_new_procs = 0;
        for (i = 0; i < num_procs; ++i) {
            cur_proc = OBJ_NEW(orte_proc_t);

            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &(cur_proc->name), &n, ORTE_NAME))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            /* See if duplicate */
            found = false;
            for (p = 0; p < procs_prev_ordered_to_terminate->size; ++p) {
                if (NULL == (prev_proc = (orte_proc_t*)opal_pointer_array_get_item(procs_prev_ordered_to_terminate, p))) {
                    continue;
                }
                if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                                &cur_proc->name,
                                                                &prev_proc->name)) {
                    found = true;
                    break;
                }
            }

            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s orted:comm:abort_procs Application %s requests term. of %s (%2d of %2d) %3s.",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(sender),
                                 ORTE_NAME_PRINT(&(cur_proc->name)),
                                 i, num_procs,
                                 (found ? "Dup" : "New")));
"Dup" : "New") )); /* If not a duplicate, then add to the to_kill list */ if( !found ) { opal_pointer_array_add(procs_to_kill, (void*)cur_proc); OBJ_RETAIN(cur_proc); opal_pointer_array_add(procs_prev_ordered_to_terminate, (void*)cur_proc); num_new_procs++; } } /* * Send the request to termiante */ if( num_new_procs > 0 ) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s orted:comm:abort_procs Terminating application requested processes (%2d / %2d).", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_new_procs, num_procs)); orte_plm.terminate_procs(procs_to_kill); } else { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s orted:comm:abort_procs No new application processes to terminating from request (%2d / %2d).", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_new_procs, num_procs)); } break; /**** TREE_SPAWN ****/ case ORTE_DAEMON_TREE_SPAWN: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received tree_spawn", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* if the PLM supports remote spawn, pass it all along */ if (NULL != orte_plm.remote_spawn) { if (ORTE_SUCCESS != (ret = orte_plm.remote_spawn(buffer))) { ORTE_ERROR_LOG(ret); } } else { opal_output(0, "%s remote spawn is NULL!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } break; /**** DELIVER A MESSAGE TO THE LOCAL PROCS ****/ case ORTE_DAEMON_MESSAGE_LOCAL_PROCS: if (orte_debug_daemons_flag) { opal_output(0, "%s orted_cmd: received message_local_procs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* unpack the jobid of the procs that are to receive the message */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } /* unpack the tag where we are to deliver the message */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &target_tag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(ret); goto CLEANUP; } OPAL_OUTPUT_VERBOSE((1, orte_debug_output, "%s orted:comm:message_local_procs delivering message to job %s tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (int)target_tag)); relay_msg = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(relay_msg, buffer); /* if job=my_jobid, then this message is for us and not for our children */ if (ORTE_PROC_MY_NAME->jobid == job) { /* if the target tag is our xcast_barrier or rml_update, then we have * to handle the message as a special case. The RML has logic in it * intended to make it easier to use. This special logic mandates that * any message we "send" actually only goes into the queue for later * transmission. Thus, since we are already in a recv when we enter * the "process_commands" function, any attempt to "send" the relay * buffer to ourselves will only be added to the queue - it won't * actually be delivered until *after* we conclude the processing * of the current recv. * * The problem here is that, for messages where we need to relay * them along the orted chain, the rml_update * message contains contact info we may well need in order to do * the relay! So we need to process those messages immediately. * The only way to accomplish that is to (a) detect that the * buffer is intended for those tags, and then (b) process * those buffers here. 
            if (ORTE_RML_TAG_RML_INFO_UPDATE == target_tag) {
                n = 1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(relay_msg, &rml_cmd, &n, ORTE_RML_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto CLEANUP;
                }
                /* initialize the routes to my peers - this will update the number
                 * of daemons in the system (i.e., orte_process_info.num_procs) as
                 * this might have changed */
                if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, relay_msg))) {
                    ORTE_ERROR_LOG(ret);
                    goto CLEANUP;
                }
            } else {
                /* just deliver it to ourselves */
                if ((ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay_msg,
                                                   target_tag,
                                                   orte_rml_send_callback, NULL)) < 0) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(relay_msg);
                }
            }
        } else {
            /* must be for our children - deliver the message */
            if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(job, relay_msg, target_tag))) {
                ORTE_ERROR_LOG(ret);
            }
            OBJ_RELEASE(relay_msg);
        }
        break;

        /**** EXIT COMMAND ****/
    case ORTE_DAEMON_EXIT_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received exit cmd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* kill the local procs */
        orte_odls.kill_local_procs(NULL);
        /* flag that orteds were ordered to terminate */
        orte_orteds_term_ordered = true;
        /* if all my routes and local children are gone, then terminate ourselves */
        if (0 == (ret = orte_routed.num_routes())) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    if (orte_debug_daemons_flag) {
                        opal_output(0, "%s orted_cmd: exit cmd, but proc %s is alive",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&proct->name));
                    }
                    return;
                }
            }
            /* call our appropriate exit procedure */
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            }
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        } else if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: exit cmd, %d routes still exist",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ret);
        }
        return;
        break;

        /**** HALT VM COMMAND ****/
    case ORTE_DAEMON_HALT_VM_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received halt_vm cmd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* kill the local procs */
        orte_odls.kill_local_procs(NULL);
        /* flag that orteds were ordered to terminate */
        orte_orteds_term_ordered = true;
        if (ORTE_PROC_IS_HNP) {
            /* if all my routes and local children are gone, then terminate ourselves */
            if (0 == orte_routed.num_routes()) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                        ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
                        /* at least one is still alive */
                        return;
                    }
                }
                /* call our appropriate exit procedure */
                if (orte_debug_daemons_flag) {
                    opal_output(0, "%s orted_cmd: all routes and children gone - exiting",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                }
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
        } else {
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        }
        return;
        break;

        /**** HALT DVM COMMAND ****/
    case ORTE_DAEMON_HALT_DVM_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received halt_dvm cmd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* we just need to xcast the HALT_VM cmd out, which will send
         * it back into us */
        answer = OBJ_NEW(opal_buffer_t);
        command = ORTE_DAEMON_HALT_VM_CMD;
        opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD);
        sig = OBJ_NEW(orte_grpcomm_signature_t);
        sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
        sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
        sig->signature[0].vpid = ORTE_VPID_WILDCARD;
        orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, answer);
        OBJ_RELEASE(answer);
        OBJ_RELEASE(sig);
        return;
        break;

        /**** SPAWN JOB COMMAND ****/
    case ORTE_DAEMON_SPAWN_JOB_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received spawn job",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        answer = OBJ_NEW(opal_buffer_t);
        job = ORTE_JOBID_INVALID;
        /* can only process this if we are the HNP */
        if (ORTE_PROC_IS_HNP) {
            /* unpack the job data */
            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &jdata, &n, ORTE_JOB))) {
                ORTE_ERROR_LOG(ret);
                goto ANSWER_LAUNCH;
            }

            /* point the originator to the sender */
            jdata->originator = *sender;

            /* assign a jobid to it */
            if (ORTE_SUCCESS != (ret = orte_plm_base_create_jobid(jdata))) {
                ORTE_ERROR_LOG(ret);
                goto ANSWER_LAUNCH;
            }

            /* store it on the global job data pool */
            opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), jdata);

            /* before we launch it, tell the IOF to forward all output exclusively
             * to the requestor */
            {
                orte_iof_tag_t ioftag;
                opal_buffer_t *iofbuf;
                orte_process_name_t source;

                ioftag = ORTE_IOF_EXCLUSIVE | ORTE_IOF_STDOUTALL | ORTE_IOF_PULL;
                iofbuf = OBJ_NEW(opal_buffer_t);
                /* pack the tag */
                if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &ioftag, 1, ORTE_IOF_TAG))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(iofbuf);
                    goto ANSWER_LAUNCH;
                }
                /* pack the name of the source */
                source.jobid = jdata->jobid;
                source.vpid = ORTE_VPID_WILDCARD;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, &source, 1, ORTE_NAME))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(iofbuf);
                    goto ANSWER_LAUNCH;
                }
                /* pack the sender as the sink */
                if (ORTE_SUCCESS != (ret = opal_dss.pack(iofbuf, sender, 1, ORTE_NAME))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(iofbuf);
                    goto ANSWER_LAUNCH;
                }
                /* send the buffer to our IOF */
                orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, iofbuf, ORTE_RML_TAG_IOF_HNP,
                                        orte_rml_send_callback, NULL);
            }
            for (i=1; i < orte_node_pool->size; i++) {
                if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    node->state = ORTE_NODE_STATE_ADDED;
                }
            }
            /* now launch the job - this will just push it into our state machine */
            if (ORTE_SUCCESS != (ret = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(ret);
                goto ANSWER_LAUNCH;
            }
            job = jdata->jobid;
        }
    ANSWER_LAUNCH:
        /* pack the jobid to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &job, 1, ORTE_JOBID))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
            goto CLEANUP;
        }
        /* return response */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_CONFIRM_SPAWN,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

        /**** CONTACT QUERY COMMAND ****/
    case ORTE_DAEMON_CONTACT_QUERY_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received contact query",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* send back contact info */
        contact_info = orte_rml.get_contact_info();

        if (NULL == contact_info) {
            ORTE_ERROR_LOG(ORTE_ERROR);
            ret = ORTE_ERROR;
            goto CLEANUP;
        }

        /* setup buffer with answer */
        answer = OBJ_NEW(opal_buffer_t);
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &contact_info, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
            goto CLEANUP;
        }

        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;
        /**** REPORT_JOB_INFO_CMD COMMAND ****/
    case ORTE_DAEMON_REPORT_JOB_INFO_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received job info query",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* if we are not the HNP, we can do nothing - report
         * back 0 procs so the tool won't hang */
        if (!ORTE_PROC_IS_HNP) {
            int32_t zero=0;

            answer = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto CLEANUP;
            }
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        } else {
            /* if we are the HNP, process the request */
            int32_t i, num_jobs;
            orte_job_t *jobdat;

            /* unpack the jobid */
            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            /* setup return */
            answer = OBJ_NEW(opal_buffer_t);

            /* if they asked for a specific job, then just get that info */
            if (ORTE_JOBID_WILDCARD != job) {
                job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
                if (NULL != (jobdat = orte_get_job_data_object(job))) {
                    num_jobs = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(answer);
                        goto CLEANUP;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(answer);
                        goto CLEANUP;
                    }
                } else {
                    /* if we get here, then send a zero answer */
                    num_jobs = 0;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(answer);
                        goto CLEANUP;
                    }
                }
            } else {
                /* since the job array is no longer left-justified and may
                 * have holes, we have to count the number of jobs. Be sure
                 * to include the daemon job - the user can slice that info
                 * out if they don't care */
                num_jobs = 0;
                for (i=0; i < orte_job_data->size; i++) {
                    if (NULL != opal_pointer_array_get_item(orte_job_data, i)) {
                        num_jobs++;
                    }
                }
                /* pack the number of jobs */
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_jobs, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(answer);
                    goto CLEANUP;
                }
                /* now pack the data, one at a time */
                for (i=0; i < orte_job_data->size; i++) {
                    if (NULL != (jobdat = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &jobdat, 1, ORTE_JOB))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(answer);
                            goto CLEANUP;
                        }
                    }
                }
            }
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        }
        break;

        /**** REPORT_NODE_INFO_CMD COMMAND ****/
    case ORTE_DAEMON_REPORT_NODE_INFO_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received node info query",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* if we are not the HNP, we can do nothing - report
         * back 0 nodes so the tool won't hang */
        if (!ORTE_PROC_IS_HNP) {
            int32_t zero=0;

            answer = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto CLEANUP;
            }
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        } else {
            /* if we are the HNP, process the request */
            int32_t i, num_nodes;
            orte_node_t *node;
            char *nid;

            /* unpack the nodename */
            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &nid, &n, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
            /* setup return */
            answer = OBJ_NEW(opal_buffer_t);
            num_nodes = 0;

            /* if they asked for a specific node, then just get that info */
            if (NULL != nid) {
                /* find this node */
                for (i=0; i < orte_node_pool->size; i++) {
                    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                        continue;
                    }
                    if (0 == strcmp(nid, node->name)) {
                        num_nodes = 1;
                        break;
                    }
                }
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(answer);
                    goto CLEANUP;
                }
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(answer);
                    goto CLEANUP;
                }
            } else {
                /* count number of nodes */
                for (i=0; i < orte_node_pool->size; i++) {
                    if (NULL != opal_pointer_array_get_item(orte_node_pool, i)) {
                        num_nodes++;
                    }
                }
                /* pack the answer */
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_nodes, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(answer);
                    goto CLEANUP;
                }
                /* pack each node separately */
                for (i=0; i < orte_node_pool->size; i++) {
                    if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &node, 1, ORTE_NODE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(answer);
                            goto CLEANUP;
                        }
                    }
                }
            }
            /* send the info */
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        }
        break;

        /**** REPORT_PROC_INFO_CMD COMMAND ****/
    case ORTE_DAEMON_REPORT_PROC_INFO_CMD:
        if (orte_debug_daemons_flag) {
            opal_output(0, "%s orted_cmd: received proc info query",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        /* if we are not the HNP, we can do nothing - report
         * back 0 procs so the tool won't hang */
        if (!ORTE_PROC_IS_HNP) {
            int32_t zero=0;

            answer = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &zero, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto CLEANUP;
            }
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        } else {
            /* if we are the HNP, process the request */
            orte_job_t *jdata;
            orte_proc_t *proc;
            orte_vpid_t vpid;
            int32_t i, num_procs;
            char *nid;

            /* setup the answer */
            answer = OBJ_NEW(opal_buffer_t);

            /* unpack the jobid */
            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }
            /* look up job data object */
            job = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, job);
            if (NULL == (jdata = orte_get_job_data_object(job))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                goto CLEANUP;
            }

            /* unpack the vpid */
            n = 1;
            if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
                ORTE_ERROR_LOG(ret);
                goto CLEANUP;
            }

            /* if they asked for a specific proc, then just get that info */
            if (ORTE_VPID_WILDCARD != vpid) {
                /* find this proc */
                for (i=0; i < jdata->procs->size; i++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
                        continue;
                    }
                    if (vpid == proc->name.vpid) {
                        num_procs = 1;
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                        /* the vpid and nodename for this proc are no longer packed
                         * in the ORTE_PROC packing routines to save space for other
                         * uses, so we have to pack them separately */
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc->pid, 1, OPAL_PID))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                        if (NULL == proc->node) {
                            nid = "UNKNOWN";
                        } else {
                            nid = proc->node->name;
                        }
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &nid, 1, OPAL_STRING))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                        break;
                    }
                }
            } else {
                /* count number of procs */
                num_procs = 0;
                for (i=0; i < jdata->procs->size; i++) {
                    if (NULL != opal_pointer_array_get_item(jdata->procs, i)) {
                        num_procs++;
                    }
                }
                /* pack the answer */
                if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_procs, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(answer);
                    goto CLEANUP;
                }
                /* pack each proc separately */
                for (i=0; i < jdata->procs->size; i++) {
                    if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc, 1, ORTE_PROC))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(answer);
                            goto CLEANUP;
                        }
                        /* the vpid and nodename for this proc are no longer packed
                         * in the ORTE_PROC packing routines to save space for other
                         * uses, so we have to pack them separately */
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &proc->pid, 1, OPAL_PID))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                        if (NULL == proc->node) {
                            nid = "UNKNOWN";
                        } else {
                            nid = proc->node->name;
                        }
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &nid, 1, OPAL_STRING))) {
                            ORTE_ERROR_LOG(ret);
                            goto CLEANUP;
                        }
                    }
                }
            }
            /* send the info */
            if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_TOOL,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
            }
        }
        break;

        /**** HEARTBEAT COMMAND ****/
    case ORTE_DAEMON_HEARTBEAT_CMD:
        ORTE_ERROR_LOG(ORTE_ERR_NOT_IMPLEMENTED);
        ret = ORTE_ERR_NOT_IMPLEMENTED;
        break;

        /**** TOP COMMAND ****/
    case ORTE_DAEMON_TOP_CMD:
        /* setup the answer */
        answer = OBJ_NEW(opal_buffer_t);
        num_replies = 0;
        hnp_accounted_for = false;

        n = 1;
        return_addr = NULL;
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &proc, &n, ORTE_NAME)) {
            /* the jobid provided will, of course, have the job family of
             * the requestor. We need to convert that to our own job family */
            proc.jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid, proc.jobid);
            if (ORTE_PROC_IS_HNP) {
                return_addr = sender;
                proc2.jobid = ORTE_PROC_MY_NAME->jobid;
                /* if the request is for a wildcard vpid, then it goes to every
                 * daemon. For scalability, we should probably xcast this some
                 * day - but for now, we just loop */
                if (ORTE_VPID_WILDCARD == proc.vpid) {
                    /* loop across all daemons */
                    for (proc2.vpid=1; proc2.vpid < orte_process_info.num_procs; proc2.vpid++) {
                        /* setup the cmd */
                        relay_msg = OBJ_NEW(opal_buffer_t);
                        command = ORTE_DAEMON_TOP_CMD;
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(relay_msg);
                            goto SEND_TOP_ANSWER;
                        }
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(relay_msg);
                            goto SEND_TOP_ANSWER;
                        }
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(relay_msg);
                            goto SEND_TOP_ANSWER;
                        }
                        /* the callback function will release relay_msg buffer */
                        if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON,
                                                        orte_rml_send_callback, NULL)) {
                            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                            OBJ_RELEASE(relay_msg);
                            ret = ORTE_ERR_COMM_FAILURE;
                        }
                        num_replies++;
                    }
                    /* account for our own reply */
                    if (!hnp_accounted_for) {
                        hnp_accounted_for = true;
                        num_replies++;
                    }
                    /* now get the data for my own procs */
                    goto GET_TOP;
                } else {
                    /* this is for a single proc - see which daemon
                     * this rank is on */
                    if (ORTE_VPID_INVALID == (proc2.vpid = orte_get_proc_daemon_vpid(&proc))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        goto SEND_TOP_ANSWER;
                    }
                    /* if the vpid is me, then just handle this myself */
                    if (proc2.vpid == ORTE_PROC_MY_NAME->vpid) {
                        if (!hnp_accounted_for) {
                            hnp_accounted_for = true;
                            num_replies++;
                        }
                        goto GET_TOP;
                    }
                    /* otherwise, forward the cmd on to the appropriate daemon */
                    relay_msg = OBJ_NEW(opal_buffer_t);
                    command = ORTE_DAEMON_TOP_CMD;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(relay_msg);
                        goto SEND_TOP_ANSWER;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &proc, 1, ORTE_NAME))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(relay_msg);
                        goto SEND_TOP_ANSWER;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, sender, 1, ORTE_NAME))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(relay_msg);
                        goto SEND_TOP_ANSWER;
                    }
                    /* the callback function will release relay_msg buffer */
                    if (0 > orte_rml.send_buffer_nb(&proc2, relay_msg, ORTE_RML_TAG_DAEMON,
                                                    orte_rml_send_callback, NULL)) {
                        ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                        OBJ_RELEASE(relay_msg);
                        ret = ORTE_ERR_COMM_FAILURE;
                    }
                }
                /* end if HNP */
            } else {
                /* this came from the HNP, but needs to go back to the original
                 * requestor. Unpack the name of that entity first */
                n = 1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc2, &n, ORTE_NAME))) {
                    ORTE_ERROR_LOG(ret);
                    /* in this case, we are helpless - we have no idea who to send an
                     * error message TO! All we can do is return - the tool that sent
                     * this request is going to hang, but there isn't anything we can
                     * do about it */
                    goto CLEANUP;
                }
                return_addr = &proc2;

            GET_TOP:
                /* this rank must be local to me, or the HNP wouldn't
                 * have sent it to me - process the request */
                if (ORTE_SUCCESS != (ret = orte_odls_base_get_proc_stats(answer, &proc))) {
                    ORTE_ERROR_LOG(ret);
                    goto SEND_TOP_ANSWER;
                }
            }
        }

    SEND_TOP_ANSWER:
        /* send the answer back to requester */
        if (ORTE_PROC_IS_HNP) {
            /* if I am the HNP, I need to also provide the number of
             * replies the caller should recv and the sample time */
            time_t mytime;
            char *cptr;

            relay_msg = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &num_replies, 1, OPAL_INT32))) {
                ORTE_ERROR_LOG(ret);
            }
            time(&mytime);
            cptr = ctime(&mytime);
            cptr[strlen(cptr)-1] = '\0';  /* remove trailing newline */
            if (ORTE_SUCCESS != (ret = opal_dss.pack(relay_msg, &cptr, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* copy the stats payload */
            opal_dss.copy_payload(relay_msg, answer);
            OBJ_RELEASE(answer);
            answer = relay_msg;
        }
        /* if we don't have a return address, then we are helpless */
        if (NULL == return_addr) {
            ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
            ret = ORTE_ERR_COMM_FAILURE;
            break;
        }
        if (0 > (ret = orte_rml.send_buffer_nb(return_addr, answer, ORTE_RML_TAG_TOOL,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
    }

 CLEANUP:
    return;
}
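/*
 * Sender-side sketch (illustrative, not from the original source): a command
 * buffer for the handler above is built by packing an orte_daemon_cmd_flag_t
 * first, followed by any command-specific payload, and sending it to the
 * ORTE_RML_TAG_DAEMON tag - the same pattern the HALT_DVM branch uses:
 *
 *     opal_buffer_t *cmd = OBJ_NEW(opal_buffer_t);
 *     orte_daemon_cmd_flag_t command = ORTE_DAEMON_EXIT_CMD;
 *     opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD);
 *     orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, cmd, ORTE_RML_TAG_DAEMON,
 *                             orte_rml_send_callback, NULL);
 */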
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **nodes = NULL, **ppnlist = NULL;
    char *envar;
    int32_t jobfam;
    int i, j, *ppn;
    orte_nid_t *node;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    orte_vpid_t vpid;
    bool byslot;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* Only application procs can use this module. Since we
     * were directly launched by someone, we need to bootstrap
     * our own global info so we can startup. */

    /* ensure that static ports were assigned - otherwise, we can't
     * work since we won't know how to talk to anyone else */
    if (NULL == getenv("OMPI_MCA_oob_tcp_static_ports") &&
        NULL == getenv("OMPI_MCA_oob_tcp_static_ports_v6")) {
        error = "static ports were not assigned";
        goto error;
    }

    /* declare ourselves to be standalone - i.e., not launched by orted */
    orte_standalone_operation = true;

    /* extract a jobid from the environment - can be totally
     * arbitrary. if one isn't provided, just fake it */
    if (NULL != (envar = getenv("OMPI_MCA_orte_jobid"))) {
        jobfam = strtol(envar, NULL, 10);
    } else {
        jobfam = 1;
    }
    ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(0, jobfam);

    /* extract a rank from the environment */
    if (NULL == (envar = getenv("OMPI_MCA_orte_rank"))) {
        error = "could not get process rank";
        goto error;
    }
    ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "%s completed name definition",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the number of procs in this job */
    if (NULL == (envar = getenv("OMPI_MCA_orte_num_procs"))) {
        error = "could not get number of processes in job";
        goto error;
    }
    orte_process_info.num_procs = strtol(envar, NULL, 10);

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /* set the app_num so that MPI attributes get set correctly */
    orte_process_info.app_num = 1;

    /* get the list of nodes */
    if (NULL == (envar = getenv("OMPI_MCA_orte_nodes"))) {
        error = "could not get list of nodes";
        goto error;
    }
    /* break this down */
    nodes = opal_argv_split(envar, ',');
    orte_process_info.num_nodes = opal_argv_count(nodes);

    /* get the ppn */
    if (NULL == (envar = getenv("OMPI_MCA_orte_ppn"))) {
        error = "could not get ppn";
        goto error;
    }
    ppnlist = opal_argv_split(envar, ',');
    ppn = (int*)malloc(orte_process_info.num_nodes * sizeof(int));
    if (1 == opal_argv_count(ppnlist)) {
        /* constant ppn */
        j = strtol(ppnlist[0], NULL, 10);
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = j;
        }
    } else {
        for (i=0; i < orte_process_info.num_nodes; i++) {
            ppn[i] = strtol(ppnlist[i], NULL, 10);
        }
    }
    opal_argv_free(ppnlist);

    /* get the mapping mode - default to byslot */
    byslot = true;
    if (NULL != (envar = getenv("OMPI_MCA_mapping")) &&
        0 == strcmp(envar, "bynode")) {
        byslot = false;
    }

    /* setup the nidmap arrays */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }

    /* set the size of the nidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
        error = "could not set pointer array size for nidmap";
        goto error;
    }

    /* construct the nidmap */
    for (i=0; i < orte_process_info.num_nodes; i++) {
        node = OBJ_NEW(orte_nid_t);
        if (0 == strcmp(nodes[i], orte_process_info.nodename) || opal_ifislocal(nodes[i])) {
            node->name = strdup(orte_process_info.nodename);
        } else {
            node->name = strdup(nodes[i]);
        }
        node->daemon = i;
        node->index = i;
        opal_pointer_array_set_item(&orte_nidmap, i, node);
    }
    opal_argv_free(nodes);

    /* create a job map for this job */
    jmap = OBJ_NEW(orte_jmap_t);
    jmap->job = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_add(&orte_jobmap, jmap);
    /* update the num procs */
    jmap->num_procs = orte_process_info.num_procs;
    /* set the size of the pidmap storage so we minimize realloc's */
    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
        ORTE_ERROR_LOG(ret);
        error = "could not set pointer array size for pidmap";
        goto error;
    }

    /* construct the pidmap */
    if (byslot) {
        vpid = 0;
        for (i=0; i < orte_process_info.num_nodes; i++) {
            node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
            /* for each node, cycle through the ppn */
            for (j=0; j < ppn[i]; j++) {
                pmap = OBJ_NEW(orte_pmap_t);
                pmap->node = i;
                pmap->local_rank = j;
                pmap->node_rank = j;
                if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                    ORTE_ERROR_LOG(ret);
                    error = "could not set pmap values";
                    goto error;
                }
                /* if this is me, then define the daemon's vpid to
                 * be the node number */
                if (vpid == ORTE_PROC_MY_NAME->vpid) {
                    ORTE_PROC_MY_DAEMON->jobid = 0;
                    ORTE_PROC_MY_DAEMON->vpid = i;
                    ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                }
                OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                     "%s node %d name %s rank %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int) node->index, node->name,
                                     ORTE_VPID_PRINT(vpid)));
                vpid++;
            }
        }
    } else {
        /* cycle across the nodes */
        vpid = 0;
        while (vpid < orte_process_info.num_procs) {
            for (i=0; i < orte_process_info.num_nodes && vpid < orte_process_info.num_procs; i++) {
                node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i);
                if (0 < ppn[i]) {
                    pmap = OBJ_NEW(orte_pmap_t);
                    pmap->node = i;
                    pmap->local_rank = ppn[i]-1;
                    pmap->node_rank = ppn[i]-1;
                    if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
                        ORTE_ERROR_LOG(ret);
                        error = "could not set pmap values";
                        goto error;
                    }
                    /* if this is me, then define the daemon's vpid to
                     * be the node number */
                    if (vpid == ORTE_PROC_MY_NAME->vpid) {
                        ORTE_PROC_MY_DAEMON->jobid = 0;
                        ORTE_PROC_MY_DAEMON->vpid = i;
                        ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
                    }
                    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                         "%s node %d name %s rank %d",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         (int) node->index, node->name, (int)vpid));
                    vpid++;
                    --ppn[i];
                }
            }
        }
    }
    free(ppn);

    /* ensure we pick the correct critical components */
    putenv("OMPI_MCA_grpcomm=hier");
    putenv("OMPI_MCA_routed=direct");

    /* use the default procedure to finish my setup */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }

    if (0 < opal_output_get_verbosity(orte_ess_base_output)) {
        orte_nidmap_dump();
        orte_jobmap_dump();
    }

    return ORTE_SUCCESS;

 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ret;
}
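/*
 * Worked example (illustrative, not from the original source): given
 * OMPI_MCA_orte_nodes=n0,n1 and OMPI_MCA_orte_ppn=2, the byslot mapping
 * above fills each node before moving on (ranks 0,1 on n0; ranks 2,3 on n1),
 * while bynode deals ranks out one per node per sweep (ranks 0,2 on n0;
 * ranks 1,3 on n1), decrementing ppn[] as it goes.
 */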
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *tmp=NULL, *tailpiece;
    orte_jobid_t jobid=ORTE_JOBID_INVALID;
    orte_vpid_t vpid=ORTE_VPID_INVALID;
    int32_t jfam;

    OBJ_CONSTRUCT(&ctl, orte_thread_ctl_t);

    my_uid = (uint32_t)getuid();

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* if we were given a jobid, use it */
    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_jobid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->jobid = jobid;
    } else {
        /* if we were given a job family, use it */
        mca_base_param_reg_string_name("orte", "ess_job_family", "Job family",
                                       true, false, NULL, &tmp);
        if (NULL != tmp) {
            jfam = strtoul(tmp, &tailpiece, 10);
            /* note: strtoul leaves tailpiece pointing at the first
             * unconverted character, so check for trailing non-numeric
             * characters rather than for NULL */
            if (UINT16_MAX < jfam || '\0' != *tailpiece) {
                /* use a string hash to restructure this to fit */
                OPAL_HASH_STR(tmp, jfam);
            }
            ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jfam << 16, 0);
        }
    }
    /* if we were given a vpid, use it */
    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_vpid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->vpid = vpid;
        if (vpid < 2) {
            /* NOT ALLOWED - POTENTIAL CONFLICT WITH ORCM AND ORCM-SCHED */
            error = "disallowed_vpid";
            ret = ORTE_ERR_BAD_PARAM;
            goto error;
        }
    }

    /* if both were given, then we are done */
    if (ORTE_JOBID_INVALID != ORTE_PROC_MY_NAME->jobid &&
        ORTE_VPID_INVALID != ORTE_PROC_MY_NAME->vpid) {
        goto complete;
    }

#if HAVE_QINFO_H
    /* if we have qlib, then we can ask it for info by which we determine our
     * name based on provided rack location info */
    {
        qinfo_t *qinfo;

        if (NULL != (qinfo = get_qinfo())) {
            /* if we were given a jobid, then leave it alone */
            if (ORTE_JOBID_INVALID == ORTE_PROC_MY_NAME->jobid) {
                /* not given - assign it to 0 */
                ORTE_PROC_MY_NAME->jobid = 0;
            }
            /* must ensure that no daemon gets vpid 0 or 1 */
            ORTE_PROC_MY_NAME->vpid = (qinfo->rack * QLIB_MAX_SLOTS_PER_RACK) + qinfo->slot + 2;
            /* ensure that the HNP uri is NULL */
            if (NULL != orte_process_info.my_hnp_uri) {
                opal_output(0, "%s CONFLICTING NAME RESOLUTION - NO NAME GIVEN, BUT HNP SPECIFIED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                error = "name conflict";
                ret = ORTE_ERR_FATAL;
                goto error;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                                 "GOT NAME %s FROM QINFO rack %d slot %d ",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 qinfo->rack, qinfo->slot));
            goto complete;
        }
    }
#endif

    /* we must have been given a vpid - we can get the jobid
     * in other ways */
    if (ORTE_VPID_INVALID == ORTE_PROC_MY_NAME->vpid) {
        /* we have an error */
        error = "missing vpid assignment";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* if we were given an HNP, we can get the jobid from
     * the HNP's name - this is decoded in proc_info.c during
     * the prolog */
    if (ORTE_JOBID_INVALID != ORTE_PROC_MY_HNP->jobid) {
        ORTE_PROC_MY_NAME->jobid = orte_process_info.my_hnp.jobid;
    } else {
        /* just fake it */
        ORTE_PROC_MY_NAME->jobid = 0;
    }

 complete:
    if (ORTE_SUCCESS != (ret = local_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "local_setup";
        goto error;
    }
    OBJ_DESTRUCT(&ctl);

    return ORTE_SUCCESS;

 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    OBJ_DESTRUCT(&ctl);

    return ret;
}
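/*
 * Illustrative note (not from the original source): with a 16-bit job family
 * of, say, jfam = 5, the ORTE_CONSTRUCT_LOCAL_JOBID(jfam << 16, 0) call above
 * yields jobid 0x00050000 - the family occupying the upper 16 bits and local
 * jobid 0 in the lower 16 bits.
 */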
static int rte_init(void)
{
    int ret, i, j;
    char *error = NULL, *localj;
    int32_t jobfam, stepid;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *cs_env, *string_key;
    char *pmi_id=NULL;
    int *ranks;
    char *tmp;
    orte_jobid_t jobid;
    orte_process_name_t proc;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

#if OPAL_HAVE_HWLOC
    /* get the topology */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
            error = "topology discovery";
            goto error;
        }
    }
#endif

    if (ORTE_PROC_IS_DAEMON) {  /* I am a daemon, launched by mpirun */
        /* we had to be given a jobid */
        mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                       true, false, NULL, &tmp);
        if (NULL == tmp) {
            error = "missing jobid";
            ret = ORTE_ERR_FATAL;
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert jobid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->jobid = jobid;

        /* get our rank from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_rank");
            error = "could not get PMI rank";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = i + 1;  /* compensate for orterun */

        /* get the number of procs from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
            error = "could not get PMI universe size";
            goto error;
        }
        orte_process_info.num_procs = i + 1;  /* compensate for orterun */

        /* complete setup */
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
    } else {  /* we are a direct-launched MPI process */
        /* get our PMI id length */
        if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
            error = "PMI_Get_id_length_max";
            goto error;
        }
        pmi_id = malloc(pmi_maxlen);
        if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
            free(pmi_id);
            error = "PMI_Get_kvs_domain_id";
            goto error;
        }
        /* PMI is very nice to us - the domain id is an integer followed
         * by a '.', followed by essentially a stepid. The first integer
         * defines an overall job number. The second integer is the number of
         * individual jobs we have run within that allocation. So we translate
         * this as the overall job number equating to our job family, and
         * the individual number equating to our local jobid */
        jobfam = strtol(pmi_id, &localj, 10);
        /* note: strtol leaves localj pointing at the first unconverted
         * character, so check for the end of the string rather than NULL */
        if ('\0' == *localj) {
            /* hmmm - no '.', so let's just use zero */
            stepid = 0;
        } else {
            localj++;  /* step over the '.' */
            stepid = strtol(localj, NULL, 10) + 1;  /* add one to avoid looking like a daemon */
        }
        free(pmi_id);

        /* now build the jobid */
        ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);

        /* get our rank */
        if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_rank");
            error = "could not get PMI rank";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = i;

        /* get the number of procs from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
            error = "could not get PMI universe size";
            goto error;
        }
        orte_process_info.num_procs = i;
        /* push into the environ for pickup in MPI layer for
         * MPI-3 required info key */
        asprintf(&ev1, "OMPI_MCA_orte_ess_num_procs=%d", i);
        putenv(ev1);
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", i);
        putenv(ev2);

        /* setup transport keys in case the MPI layer needs them -
         * we can use the jobfam and stepid as unique keys
         * because they are unique values assigned by the RM */
        unique_key[0] = (uint64_t)jobfam;
        unique_key[1] = (uint64_t)stepid;
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&envar, "%s=%s", cs_env, string_key);
        putenv(envar);
        /* cannot free the envar as that messes up our environ */
        free(cs_env);
        free(string_key);

        /* our app_context number can only be 0 as we don't support
         * dynamic spawns */
        orte_process_info.app_num = 0;

        /* setup my daemon's name - arbitrary, since we don't route
         * messages */
        ORTE_PROC_MY_DAEMON->jobid = 0;
        ORTE_PROC_MY_DAEMON->vpid = 0;

        /* ensure we pick the correct critical components */
        putenv("OMPI_MCA_grpcomm=pmi");
        putenv("OMPI_MCA_routed=direct");

        /* now use the default procedure to finish my setup */
        if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_app_setup";
            goto error;
        }

        /* store our info into the database */
        if (ORTE_SUCCESS != (ret = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_HOSTNAME,
                                                 orte_process_info.nodename, OPAL_STRING))) {
            error = "db store daemon vpid";
            goto error;
        }

        /* get our local proc info to find our local rank */
        if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_clique_size");
            error = "could not get PMI clique size";
            goto error;
        }
        /* store that info - remember, we want the number of peers that
         * share the node WITH ME, so we have to subtract ourselves from
         * that number */
        orte_process_info.num_local_peers = i - 1;

        /* now get the specific ranks */
        ranks = (int*)malloc(i * sizeof(int));
        if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks");
            error = "could not get clique ranks";
            goto error;
        }
        /* The clique ranks are returned in rank order, so
         * cycle thru the array and update the local/node
         * rank info */
        proc.jobid = ORTE_PROC_MY_NAME->jobid;
        for (j=0; j < i; j++) {
            proc.vpid = ranks[j];
            local_rank = j;
            node_rank = j;
            if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) {
                orte_process_info.my_local_rank = local_rank;
                orte_process_info.my_node_rank = node_rank;
            }
            if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_LOCALRANK,
                                                     &local_rank, ORTE_LOCAL_RANK))) {
                error = "db store local rank";
                goto error;
            }
            if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_NODERANK,
                                                     &node_rank, ORTE_NODE_RANK))) {
store node rank"; goto error; } } free(ranks); /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /* flag that we completed init */ app_init_complete = true; return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }