static void vm_term(int status,
                    orte_process_name_t *sender,
                    orcm_pnp_tag_t tag,
                    struct iovec *msg, int count,
                    opal_buffer_t *buf,
                    void *cbdata)
{
    int rc, n;
    uint16_t jfam;
    opal_buffer_t response;
    orcm_tool_cmd_t flag = ORCM_TOOL_STOP_CMD;

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "%s GOT TERM COMMAND FROM %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* if this isn't intended for me, ignore it */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buf, &jfam, &n, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (jfam != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                             "%s GOT TERM COMMAND FOR DVM %d - NOT FOR ME!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jfam));
        return;
    }

    ORTE_TIMER_EVENT(0, 0, orcm_just_quit);
}
static void vm_cmd(int status,
                   orte_process_name_t *sender,
                   orcm_pnp_tag_t tag,
                   struct iovec *msg, int count,
                   opal_buffer_t *buffer,
                   void *cbdata)
{
    int rc, n;
    uint16_t jfam;
    orte_process_name_t generator;

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s GOT COMMAND FROM %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* if this isn't intended for me, ignore it */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jfam, &n, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (jfam != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s GOT COMMAND FOR DVM %d - NOT FOR ME!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jfam));
        return;
    }

    ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, buffer, ORTE_RML_TAG_DAEMON,
                       orte_daemon_cmd_processor);
}
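/*
 * For reference, a minimal sketch of the sending side that vm_term() and
 * vm_cmd() guard against: the first item packed into the buffer must be the
 * 16-bit job family of the target DVM, since both receivers unpack an
 * OPAL_UINT16 and silently discard anything addressed to another family.
 * The helper name is an illustrative assumption; any command payload would
 * be packed after the job-family field.
 */
static int address_dvm(opal_buffer_t *buf)
{
    int rc;
    uint16_t jfam;

    /* receivers compare this against their own job family and
     * drop mismatches - pack it before any payload */
    jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &jfam, 1, OPAL_UINT16))) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}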
static bool route_is_defined(const orte_process_name_t *target)
{
    int i;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;

    /* if the route is to a different job family and we are the HNP, look it up */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        if (ORTE_PROC_IS_HNP) {
            jfamily = ORTE_JOB_FAMILY(target->jobid);
            for (i=0; i < orte_routed_jobfams.size; i++) {
                if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                    continue;
                }
                if (jfam->job_family == jfamily) {
                    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                         "%s routed_radix: route to %s is defined",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOB_FAMILY_PRINT(target->jobid)));
                    return true;
                }
            }
            return false;
        }
        /* if we are not the HNP, then the answer is always true as
         * we send it via the HNP */
        return true;
    }

    /* find out what daemon hosts this proc */
    if (ORTE_VPID_INVALID == orte_get_proc_daemon_vpid((orte_process_name_t*)target)) {
        return false;
    }

    return true;
}
void orte_routed_base_update_hnps(opal_buffer_t *buf)
{
    int n, rc;
    char *uri;
    orte_process_name_t name;
    orte_routed_jobfam_t *jfam;
    uint16_t jobfamily;

    n = 1;
    while (ORTE_SUCCESS == opal_dss.unpack(buf, &uri, &n, OPAL_STRING)) {
        /* extract the name */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(uri, &name, NULL))) {
            ORTE_ERROR_LOG(rc);
            free(uri);
            n = 1;
            continue;
        }
        jobfamily = ORTE_JOB_FAMILY(name.jobid);
        /* see if we already have this connection */
        for (n=0; n < orte_routed_jobfams.size; n++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, n))) {
                continue;
            }
            if (jobfamily == jfam->job_family) {
                /* update the uri */
                if (NULL != jfam->hnp_uri) {
                    free(jfam->hnp_uri);
                }
                jfam->hnp_uri = strdup(uri);
                OPAL_OUTPUT_VERBOSE((10, orte_routed_base_framework.framework_output,
                                     "%s adding remote HNP %s\n\t%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&name), uri));
                goto done;
            }
        }
        /* nope - create it */
        jfam = OBJ_NEW(orte_routed_jobfam_t);
        jfam->job_family = jobfamily;
        jfam->route.jobid = name.jobid;
        jfam->route.vpid = name.vpid;
        jfam->hnp_uri = strdup(uri);
        /* store it so the route can be found later */
        opal_pointer_array_add(&orte_routed_jobfams, jfam);
    done:
        free(uri);
        n = 1;
    }
}
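/*
 * A hedged sketch of the producer side for orte_routed_base_update_hnps():
 * the buffer it consumes is simply a sequence of OPAL_STRING contact URIs,
 * one per remote HNP. How the caller obtained each uri is outside this
 * sketch; only the packing format is shown.
 */
static int pack_hnp_uris(opal_buffer_t *buf, char **uris, int num_uris)
{
    int i, rc;

    for (i=0; i < num_uris; i++) {
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &uris[i], 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }
    return ORTE_SUCCESS;
}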
static void launch_restart(int fd, short args, void *cbdata)
{
    orte_errmgr_caddy_t *cd = (orte_errmgr_caddy_t*)cbdata;
    int rc;
    opal_buffer_t *bfr;
    uint16_t jfam;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s RESTARTING JOB %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(cd->jdata->jobid)));

    /* reset the job */
    orte_plm_base_reset_job(cd->jdata);

    /* the resilient mapper will automatically avoid restarting the
     * proc on its former node */

    /* map the job again */
    if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(cd->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    bfr = OBJ_NEW(opal_buffer_t);
    /* indicate the target DVM */
    jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    opal_dss.pack(bfr, &jfam, 1, OPAL_UINT16);
    /* get the launch data */
    if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(bfr, cd->jdata->jobid))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(bfr);
        goto cleanup;
    }
    /* send it to the daemons */
    if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL,
                                                 ORCM_PNP_TAG_COMMAND,
                                                 NULL, 0, bfr, cbfunc, NULL))) {
        ORTE_ERROR_LOG(rc);
    }

 cleanup:
    OBJ_RELEASE(cd);
}
static int _setup_jobfam_session_dir(orte_process_name_t *proc)
{
    int rc = ORTE_SUCCESS;

    /* construct the top_session_dir if we need */
    if (NULL == orte_process_info.jobfam_session_dir) {
        if (ORTE_SUCCESS != (rc = _setup_top_session_dir())) {
            return rc;
        }

        if (ORTE_PROC_IS_HNP) {
            if (0 > asprintf(&orte_process_info.jobfam_session_dir,
                             "%s/pid.%lu", orte_process_info.top_session_dir,
                             (unsigned long)orte_process_info.pid)) {
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        } else {
            /* we were not given one, so define it */
            if (NULL == proc || (ORTE_JOBID_INVALID == proc->jobid)) {
                if (0 > asprintf(&orte_process_info.jobfam_session_dir,
                                 "%s/jobfam", orte_process_info.top_session_dir)) {
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            } else {
                if (0 > asprintf(&orte_process_info.jobfam_session_dir,
                                 "%s/jf.%d", orte_process_info.top_session_dir,
                                 ORTE_JOB_FAMILY(proc->jobid))) {
                    orte_process_info.jobfam_session_dir = NULL;
                    rc = ORTE_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
        }
    }

 exit:
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
static int orte_routed_base_open(mca_base_open_flag_t flags)
{
    orte_routed_jobfam_t *jfam;

    orte_routed_base_wait_sync = false;

    /* Initialize storage of remote hnp uris */
    OBJ_CONSTRUCT(&orte_routed_jobfams, opal_pointer_array_t);
    opal_pointer_array_init(&orte_routed_jobfams, 8, INT_MAX, 8);

    /* prime it with our HNP uri */
    jfam = OBJ_NEW(orte_routed_jobfam_t);
    jfam->route.jobid = ORTE_PROC_MY_HNP->jobid;
    jfam->route.vpid = ORTE_PROC_MY_HNP->vpid;
    jfam->job_family = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
    if (NULL != orte_process_info.my_hnp_uri) {
        jfam->hnp_uri = strdup(orte_process_info.my_hnp_uri);
    }
    opal_pointer_array_add(&orte_routed_jobfams, jfam);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_routed_base_framework, flags);
}
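/*
 * Illustration of the jobid bit layout these routines rely on: an ORTE
 * jobid is a 32-bit value whose upper 16 bits identify the "job family"
 * (the DVM / mpirun instance) and whose lower 16 bits identify the local
 * job within that family. The macros below restate the ORTE definitions
 * under local names for a standalone demonstration; compile and run to
 * verify the arithmetic. The literal jobid value is invented.
 */
#include <stdio.h>
#include <stdint.h>

#define JOB_FAMILY(n)                 (((n) >> 16) & 0x0000ffff)
#define LOCAL_JOBID(n)                ((n) & 0x0000ffff)
#define CONSTRUCT_LOCAL_JOBID(fam, l) (((fam) & 0xffff0000) | ((l) & 0x0000ffff))

int main(void)
{
    uint32_t hnp_jobid = 0x4a390000;  /* family 0x4a39, local job 0 (the DVM itself) */
    uint32_t singleton = CONSTRUCT_LOCAL_JOBID(hnp_jobid, 1);

    printf("family=%u local=%u\n",
           (unsigned)JOB_FAMILY(singleton), (unsigned)LOCAL_JOBID(singleton));
    /* prints: family=19001 local=1 - same family as the HNP, first job in it */
    return 0;
}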
static char *orte_build_job_session_dir(char *top_dir,
                                        orte_process_name_t *proc,
                                        orte_jobid_t jobid)
{
    char *jobfam = NULL;
    char *job_session_dir;

    if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return NULL;
    }

    if (ORTE_JOBID_WILDCARD != jobid) {
        char *job = NULL;

        if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(jobid))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            job_session_dir = NULL;
            goto out;
        }
        job_session_dir = opal_os_path(false, top_dir, jobfam, job, NULL);
        free(job);
        if (NULL == job_session_dir) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        }
    } else {
        job_session_dir = opal_os_path(false, top_dir, jobfam, NULL);
        if (NULL == job_session_dir) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        }
    }

 out:
    free(jobfam);
    return job_session_dir;
}
static int rte_init(void)
{
    int rc, ret;
    char *error = NULL;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *string_key;
    opal_value_t *kv;
    char *val;
    int u32, *u32ptr;
    uint16_t u16, *u16ptr;
    orte_process_name_t name;

    /* run the prolog */
    if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    u32ptr = &u32;
    u16ptr = &u16;

    if (NULL != mca_ess_singleton_component.server_uri) {
        /* we are going to connect to a server HNP */
        if (0 == strncmp(mca_ess_singleton_component.server_uri, "file", strlen("file")) ||
            0 == strncmp(mca_ess_singleton_component.server_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;

            /* it is a file - get the filename */
            filename = strchr(mca_ess_singleton_component.server_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }
            ++filename; /* space past the : */
            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }
            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) {
                /* can't find or read file! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access",
                               true, "singleton", mca_ess_singleton_component.server_uri);
                return ORTE_ERROR;
            }
            memset(input, 0, 1024);  // initialize the array to ensure a NULL termination
            if (NULL == fgets(input, 1023, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad",
                               true, "singleton", mca_ess_singleton_component.server_uri,
                               "singleton");
                return ORTE_ERROR;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            orte_process_info.my_hnp_uri = strdup(input);
        } else {
            orte_process_info.my_hnp_uri = strdup(mca_ess_singleton_component.server_uri);
        }
        /* save the daemon uri - we will process it later */
        orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
        /* construct our name - we are in their job family, so we know that
         * much. However, we cannot know how many other singletons and jobs
         * this HNP is running. Oh well - if someone really wants to use this
         * option, they can try to figure it out. For now, we'll just assume
         * we are the only ones */
        ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_HNP->jobid, 1);
        /* obviously, we are vpid=0 for this job */
        ORTE_PROC_MY_NAME->vpid = 0;

        /* for convenience, push the pubsub version of this param into the environ */
        opal_setenv(OPAL_MCA_PREFIX"pubsub_orte_server",
                    orte_process_info.my_hnp_uri, true, &environ);
    } else if (NULL != getenv("SINGULARITY_CONTAINER") ||
               mca_ess_singleton_component.isolated) {
        /* ensure we use the isolated pmix component */
        opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
    } else {
        /* spawn our very own HNP to support us */
        if (ORTE_SUCCESS != (rc = fork_hnp())) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* our name was given to us by the HNP */
        opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
    }

    /* get an async event base - we use the opal_async one so
     * we don't startup extra threads if not needed */
    orte_event_base = opal_progress_thread_init(NULL);
    progress_thread_running = true;

    /* open and setup pmix */
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        error = "opening pmix";
        goto error;
    }
    if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) {
        error = "select pmix";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* initialize the selected module */
    if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) {
        /* we cannot run */
        error = "pmix init";
        goto error;
    }

    /* pmix.init set our process name down in the OPAL layer,
     * so carry it forward here */
    ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid;
    ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid;
    name.jobid = OPAL_PROC_MY_NAME.jobid;
    name.vpid = ORTE_VPID_WILDCARD;

    /* get our local rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting local rank";
        goto error;
    }
    orte_process_info.my_local_rank = u16;

    /* get our node rank from PMI */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK,
                          ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16);
    if (OPAL_SUCCESS != ret) {
        error = "getting node rank";
        goto error;
    }
    orte_process_info.my_node_rank = u16;

    /* get max procs */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS,
                          &name, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS != ret) {
        error = "getting max procs";
        goto error;
    }
    orte_process_info.max_procs = u32;

    /* we are a singleton, so there is only one proc in the job */
    orte_process_info.num_procs = 1;

    /* push into the environ for pickup in MPI layer for
     * MPI-3 required info key */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) {
        asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs);
        putenv(ev1);
        added_num_procs = true;
    }
    if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) {
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs);
        putenv(ev2);
        added_app_ctx = true;
    }

    /* get our app number from PMI - ok if not found */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_APPNUM,
                          ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32);
    if (OPAL_SUCCESS == ret) {
        orte_process_info.app_num = u32;
    } else {
        orte_process_info.app_num = 0;
    }
    /* set some other standard values */
    orte_process_info.num_local_peers = 0;

    /* setup transport keys in case the MPI layer needs them -
     * we can use the jobfam and stepid as unique keys
     * because they are unique values assigned by the RM */
    if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
        unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
        unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid);
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        putenv(envar);
        added_transport_keys = true;
        /* cannot free the envar as that messes up our environ */
        free(string_key);
    }

    /* retrieve our topology */
    OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
                          &name, &val, OPAL_STRING);
    if (OPAL_SUCCESS == ret && NULL != val) {
        /* load the topology */
        if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            free(val);
            error = "setting topology";
            goto error;
        }
        if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
            ret = OPAL_ERROR;
            free(val);
            hwloc_topology_destroy(opal_hwloc_topology);
            error = "setting topology";
            goto error;
        }
        /* since we are loading this from an external source, we have to
         * explicitly set a flag so hwloc sets things up correctly */
        if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
                                          (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
                                           HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        /* now load the topology */
        if (0 != hwloc_topology_load(opal_hwloc_topology)) {
            ret = OPAL_ERROR;
            hwloc_topology_destroy(opal_hwloc_topology);
            free(val);
            error = "setting topology";
            goto error;
        }
        free(val);
    } else {
        /* it wasn't passed down to us, so go get it */
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
        /* push it into the PMIx database in case someone
         * tries to retrieve it so we avoid an attempt to
         * get it again */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
        kv->type = OPAL_STRING;
        if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology,
                                                        &kv->data.string, &u32))) {
            error = "topology export";
            goto error;
        }
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) {
            error = "topology store";
            goto error;
        }
        OBJ_RELEASE(kv);
    }

    /* use the std app init to complete the procedure */
    if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup(true))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* push our hostname so others can find us, if they need to */
    OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME,
                          orte_process_info.nodename, OPAL_STRING);
    if (ORTE_SUCCESS != ret) {
        error = "db store hostname";
        goto error;
    }

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    return ret;
}
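/*
 * Hedged sketch of the transport-key idea near the end of rte_init(): the
 * two 64-bit values (job family, local jobid) are rendered into a string
 * for the environment. orte_pre_condition_transports_print() is the real
 * formatter; this standalone stand-in only illustrates the concept and
 * makes no claim about the exact output format.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static void print_transport_key(uint64_t unique_key[2], char *out, size_t len)
{
    /* hypothetical formatting - the real formatter lives in ORTE */
    snprintf(out, len, "%016" PRIx64 "-%016" PRIx64,
             unique_key[0], unique_key[1]);
}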
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;

    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* initialize */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;

    if (ORTE_PROC_IS_APP) {
        /* if I am an application, AND I have knowledge of
         * my daemon (i.e., a daemon launched me), then I
         * always route thru the daemon */
        if (NULL != orte_process_info.my_daemon_uri) {
            ret = ORTE_PROC_MY_DAEMON;
        } else {
            /* I was direct launched and do not have
             * a daemon, so I have to route direct */
            ret = target;
        }
        goto found;
    }

    /* if I am a tool, the route is direct if target is in
     * my own job family, and to the target's HNP if not */
    if (ORTE_PROC_IS_TOOL) {
        if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            ret = target;
            goto found;
        } else {
            ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
            ret = &daemon;
            goto found;
        }
    }

    /****** HNP AND DAEMONS ONLY ******/
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routing direct to the HNP",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        ret = ORTE_PROC_MY_HNP;
        goto found;
    }

    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    /* find out what daemon hosts this proc */
    if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* if the daemon is me, then send direct to the target! */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    }

    /* else route to this daemon directly */
    ret = &daemon;

 found:
    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s routed_direct_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(ret)));
    return *ret;
}
int orte_iof_hnp_send_data_to_endpoint(orte_process_name_t *host,
                                       orte_process_name_t *target,
                                       orte_iof_tag_t tag,
                                       unsigned char *data, int numbytes)
{
    opal_buffer_t *buf;
    int rc;

    /* if the host is a daemon and we are in the process of aborting,
     * then ignore this request. We leave it alone if the host is not
     * a daemon because it might be a tool that wants to watch the
     * output from an abort procedure */
    if (ORTE_JOB_FAMILY(host->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) &&
        orte_job_term_ordered) {
        return ORTE_SUCCESS;
    }

    buf = OBJ_NEW(opal_buffer_t);

    /* pack the tag - we do this first so that flow control messages can
     * consist solely of the tag */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }
    /* pack the name of the target - this is either the intended
     * recipient (if the tag is stdin and we are sending to a daemon),
     * or the source (if we are sending to anyone else) */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, target, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }

    /* if data is NULL, then we are done */
    if (NULL != data) {
        /* pack the data - if numbytes is zero, we will pack zero bytes */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, data, numbytes, OPAL_BYTE))) {
            ORTE_ERROR_LOG(rc);
            OBJ_RELEASE(buf);
            return rc;
        }
    }

    /* if the target is wildcard, then this needs to go to everyone - xcast it */
    if (ORTE_PROC_MY_NAME->jobid == host->jobid &&
        ORTE_VPID_WILDCARD == host->vpid) {
        /* xcast this to everyone - the local daemons will know how to handle it */
        orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, buf, ORTE_RML_TAG_IOF_PROXY);
        OBJ_RELEASE(buf);
        return ORTE_SUCCESS;
    }

    /* send the buffer to the host - this is either a daemon or
     * a tool that requested IOF */
    if (0 > (rc = orte_rml.send_buffer_nb(host, buf, ORTE_RML_TAG_IOF_PROXY, 0,
                                          send_cb, NULL))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    return ORTE_SUCCESS;
}
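/*
 * Hedged usage sketch: forwarding an output fragment from proc "origin" to
 * every daemon by addressing the daemon job with a wildcard vpid, which
 * triggers the xcast path above. The helper name and surrounding call site
 * are assumptions; the argument conventions mirror the function above.
 */
static void forward_stdout(orte_process_name_t *origin,
                           unsigned char *data, int numbytes)
{
    orte_process_name_t host;

    host.jobid = ORTE_PROC_MY_NAME->jobid;  /* the daemon job */
    host.vpid = ORTE_VPID_WILDCARD;         /* everyone -> xcast */
    (void) orte_iof_hnp_send_data_to_endpoint(&host, origin,
                                              ORTE_IOF_STDOUT,
                                              data, numbytes);
}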
/*
 * Construct the fullpath to the session directory
 */
int orte_session_dir_get_name(char **fulldirpath,
                              char **return_prefix,  /* This will come back as the valid tmp dir */
                              char **return_frontend,
                              char *hostid,
                              char *batchid,
                              orte_process_name_t *proc)
{
    char *hostname  = NULL,
         *batchname = NULL,
         *sessions  = NULL,
         *user      = NULL,
         *prefix    = NULL,
         *frontend  = NULL,
         *jobfam    = NULL,
         *job       = NULL,
         *vpidstr   = NULL;
    bool prefix_provided = false;
    int exit_status = ORTE_SUCCESS;
    size_t len;
    int uid;
    struct passwd *pwdent;

    /* Ensure that system info is set */
    orte_proc_info();

    /* get the name of the user */
    uid = getuid();
#ifdef HAVE_GETPWUID
    pwdent = getpwuid(uid);
#else
    pwdent = NULL;
#endif
    if (NULL != pwdent) {
        user = strdup(pwdent->pw_name);
    } else {
        orte_show_help("help-orte-runtime.txt",
                       "orte:session:dir:nopwname", true);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    /*
     * set the 'hostname'
     */
    if (NULL != hostid) { /* User specified version */
        hostname = strdup(hostid);
    } else {              /* check if it is set elsewhere */
        if (NULL != orte_process_info.nodename) {
            hostname = strdup(orte_process_info.nodename);
        } else {
            /* Couldn't find it, so fail */
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            exit_status = ORTE_ERR_BAD_PARAM;
            goto cleanup;
        }
    }

    /*
     * set the 'batchid'
     */
    if (NULL != batchid) {
        batchname = strdup(batchid);
    } else {
        batchname = strdup("0");
    }

    /*
     * get the front part of the session directory
     * Will look something like:
     *    openmpi-sessions-USERNAME@HOSTNAME_BATCHID
     */
    if (NULL != orte_process_info.top_session_dir) {
        frontend = strdup(orte_process_info.top_session_dir);
    } else { /* If not set then construct it */
        if (0 > asprintf(&frontend, "openmpi-sessions-%s@%s_%s", user, hostname, batchname)) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
            goto cleanup;
        }
    }

    /*
     * Construct the session directory
     */
    /* If we were given a valid vpid then we can construct it fully into:
     *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID/VPID
     */
    if (NULL != proc) {
        if (ORTE_VPID_INVALID != proc->vpid) {
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (ORTE_SUCCESS != orte_util_convert_vpid_to_string(&vpidstr, proc->vpid)) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            sessions = opal_os_path(false, frontend, jobfam, job, vpidstr, NULL);
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        }
        /* If we were given a valid jobid then we can construct it partially into:
         *   openmpi-sessions-USERNAME@HOSTNAME_BATCHID/JOB-FAMILY/JOBID
         */
        else if (ORTE_JOBID_INVALID != proc->jobid) {
            if (0 > asprintf(&jobfam, "%d", ORTE_JOB_FAMILY(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (0 > asprintf(&job, "%d", ORTE_LOCAL_JOBID(proc->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            sessions = opal_os_path(false, frontend, jobfam, job, NULL);
            if (NULL == sessions) {
                ORTE_ERROR_LOG(ORTE_ERROR);
                exit_status = ORTE_ERROR;
                goto cleanup;
            }
        }
        /* if both are invalid */
        else {
            sessions = strdup(frontend); /* must dup this to avoid double-free later */
        }
    }
    /* If we were not given a proc at all, then we just set it to frontend */
    else {
        sessions = strdup(frontend); /* must dup this to avoid double-free later */
    }

    /*
     * If the user specified an invalid prefix, or no prefix at all,
     * we need to keep looking
     */
    if (NULL != fulldirpath && NULL != *fulldirpath) {
        free(*fulldirpath);
        *fulldirpath = NULL;
    }

    if (NULL != return_prefix && NULL != *return_prefix) { /* use the user specified one, if available */
        prefix = strdup(*return_prefix);
        prefix_provided = true;
    }
    /* Try to find a proper alternative prefix */
    else if (NULL != orte_process_info.tmpdir_base) { /* stored value */
        prefix = strdup(orte_process_info.tmpdir_base);
    } else { /* General Environment var */
        prefix = strdup(opal_tmp_directory());
    }
    len = strlen(prefix);
    /* check for a trailing path separator */
    if (OPAL_PATH_SEP[0] == prefix[len-1]) {
        prefix[len-1] = '\0';
    }

    /* BEFORE doing anything else, check to see if this prefix is
     * allowed by the system
     */
    if (NULL != orte_prohibited_session_dirs) {
        char **list;
        int i, len;

        /* break the string into tokens - it should be
         * separated by ','
         */
        list = opal_argv_split(orte_prohibited_session_dirs, ',');
        len = opal_argv_count(list);
        /* cycle through the list */
        for (i=0; i < len; i++) {
            /* check if prefix matches */
            if (0 == strncmp(prefix, list[i], strlen(list[i]))) {
                /* this is a prohibited location */
                orte_show_help("help-orte-runtime.txt",
                               "orte:session:dir:prohibited",
                               true, prefix, orte_prohibited_session_dirs);
                opal_argv_free(list); /* don't leak the token list on this error path */
                return ORTE_ERR_FATAL;
            }
        }
        opal_argv_free(list); /* done with this */
    }

    /*
     * Construct the absolute final path, if requested
     */
    if (NULL != fulldirpath) {
        *fulldirpath = opal_os_path(false, prefix, sessions, NULL);
    }

    /*
     * Return the frontend and prefix, if user requested we do so
     */
    if (NULL != return_frontend) {
        *return_frontend = strdup(frontend);
    }
    if (!prefix_provided && NULL != return_prefix) {
        *return_prefix = strdup(prefix);
    }

 cleanup:
    if (NULL != hostname) free(hostname);
    if (NULL != batchname) free(batchname);
    if (NULL != sessions) free(sessions);
    if (NULL != user) free(user);
    if (NULL != prefix) free(prefix);
    if (NULL != frontend) free(frontend);
    if (NULL != jobfam) free(jobfam);
    if (NULL != job) free(job);
    if (NULL != vpidstr) free(vpidstr);

    return exit_status;
}
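/*
 * Standalone illustration of the final layout produced above, assuming the
 * default frontend construction:
 *   <prefix>/openmpi-sessions-<user>@<host>_<batch>/<jobfam>/<job>/<vpid>
 * The concrete user, host, and id values below are invented for the example.
 */
#include <stdio.h>

int main(void)
{
    char path[256];

    /* hypothetical values: user alice on node01, batch 0, family 19001, job 1, vpid 0 */
    snprintf(path, sizeof(path),
             "%s/openmpi-sessions-%s@%s_%s/%d/%d/%d",
             "/tmp", "alice", "node01", "0", 19001, 1, 0);
    printf("%s\n", path); /* /tmp/openmpi-sessions-alice@node01_0/19001/1/0 */
    return 0;
}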
static int route_lost(const orte_process_name_t *route)
{
    opal_list_item_t *item;
    orte_routed_tree_t *child;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                         "%s route to %s lost",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(route)));

    /* if the route is to a different job family and we are the HNP, look it up */
    if ((ORTE_JOB_FAMILY(route->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) &&
        ORTE_PROC_IS_HNP) {
        jfamily = ORTE_JOB_FAMILY(route->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_radix: route to %s lost",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(route->jobid)));
                opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
                OBJ_RELEASE(jfam);
                break;
            }
        }
    }

    /* if we lose the connection to the lifeline and we are NOT already
     * in finalize, tell the OOB to abort.
     * NOTE: we cannot call abort from here as the OOB needs to first
     * release a thread-lock - otherwise, we will hang!!
     */
    if (!orte_finalizing &&
        NULL != lifeline &&
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, route, lifeline)) {
        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed:radix: Connection to lifeline %s lost",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(lifeline)));
        return ORTE_ERR_FATAL;
    }

    /* if we are the HNP or daemon, and the route is a daemon,
     * see if it is one of our children - if so, remove it */
    if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) &&
        route->jobid == ORTE_PROC_MY_NAME->jobid) {
        for (item = opal_list_get_first(&my_children);
             item != opal_list_get_end(&my_children);
             item = opal_list_get_next(item)) {
            child = (orte_routed_tree_t*)item;
            if (child->vpid == route->vpid) {
                opal_list_remove_item(&my_children, item);
                OBJ_RELEASE(item);
                return ORTE_SUCCESS;
            }
        }
    }

    /* we don't care about this one, so return success */
    return ORTE_SUCCESS;
}
static orte_process_name_t get_route(orte_process_name_t *target)
{
    orte_process_name_t *ret, daemon;
    opal_list_item_t *item;
    orte_routed_tree_t *child;
    int i;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;

    if (!orte_routing_is_enabled) {
        ret = target;
        goto found;
    }

    /* initialize */
    daemon.jobid = ORTE_PROC_MY_DAEMON->jobid;
    daemon.vpid = ORTE_PROC_MY_DAEMON->vpid;

    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* if it is me, then the route is just direct */
    if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) {
        ret = target;
        goto found;
    }

    /* if I am an application process, always route via my local daemon */
    if (ORTE_PROC_IS_APP) {
        ret = ORTE_PROC_MY_DAEMON;
        goto found;
    }

    /* if I am a tool, the route is direct if target is in
     * my own job family, and to the target's HNP if not */
    if (ORTE_PROC_IS_TOOL) {
        if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
            ret = target;
            goto found;
        } else {
            ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid);
            ret = &daemon;
            goto found;
        }
    }

    /****** HNP AND DAEMONS ONLY ******/

    /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* if I am a daemon, route this via the HNP */
        if (ORTE_PROC_IS_DAEMON) {
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }

        /* if I am the HNP or a tool, then I stored a route to
         * this job family, so look it up */
        jfamily = ORTE_JOB_FAMILY(target->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_radix: route to %s found",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(target->jobid)));
                ret = &jfam->route;
                goto found;
            }
        }
        /* not found - so we have no route */
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* THIS CAME FROM OUR OWN JOB FAMILY... */

    /* if this is going to the HNP, then send it direct if we don't know
     * how to get there - otherwise, send it via the tree */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) {
        if (!hnp_direct || orte_static_ports) {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                 "%s routing to the HNP through my parent %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_PARENT)));
            ret = ORTE_PROC_MY_PARENT;
            goto found;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                 "%s routing direct to the HNP",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ret = ORTE_PROC_MY_HNP;
            goto found;
        }
    }

    daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    /* find out what daemon hosts this proc */
    if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ret = ORTE_NAME_INVALID;
        goto found;
    }

    /* if the daemon is me, then send direct to the target! */
    if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) {
        ret = target;
        goto found;
    } else if (orte_process_info.num_procs < mca_routed_radix_component.max_connections) {
        /* if the job is small enough, send direct to the target's daemon */
        ret = &daemon;
        goto found;
    } else {
        /* search routing tree for next step to that daemon */
        for (item = opal_list_get_first(&my_children);
             item != opal_list_get_end(&my_children);
             item = opal_list_get_next(item)) {
            child = (orte_routed_tree_t*)item;
            if (child->vpid == daemon.vpid) {
                /* the child is hosting the proc - just send it there */
                ret = &daemon;
                goto found;
            }
            /* otherwise, see if the daemon we need is below the child */
            if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) {
                /* yep - we need to step through this child */
                daemon.vpid = child->vpid;
                ret = &daemon;
                goto found;
            }
        }
    }

    /* if we get here, then the target daemon is not beneath
     * any of our children, so we have to step up through our parent */
    daemon.vpid = ORTE_PROC_MY_PARENT->vpid;
    ret = &daemon;

 found:
    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_radix_get(%s) --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(ret)));
    return *ret;
}
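/*
 * Sketch of how a routing-tree entry's "relatives" bitmap is consulted:
 * each child carries a bitmap of every daemon vpid reachable through it,
 * so one bit test selects the subtree. The helper name is hypothetical;
 * the fields and the bitmap call mirror the loop in get_route() above.
 */
static bool reaches(orte_routed_tree_t *child, orte_vpid_t target_vpid)
{
    /* direct child, or somewhere below it in the tree */
    return (child->vpid == target_vpid) ||
           opal_bitmap_is_set_bit(&child->relatives, target_vpid);
}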
static int delete_route(orte_process_name_t *proc)
{
    int i;
    orte_routed_jobfam_t *jfam;
    uint16_t jfamily;

    if (proc->jobid == ORTE_JOBID_INVALID ||
        proc->vpid == ORTE_VPID_INVALID) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* if I am an application process, I don't have any routes
     * so there is nothing for me to do */
    if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_TOOL) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                         "%s routed_radix_delete_route for %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* if this is from a different job family, then I need to
     * look it up appropriately */
    if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* if I am a daemon, then I will automatically route
         * anything to this job family via my HNP - so I have nothing
         * in my routing table and thus have nothing to do
         * here, just return */
        if (ORTE_PROC_IS_DAEMON) {
            return ORTE_SUCCESS;
        }

        /* see if this job family is present */
        jfamily = ORTE_JOB_FAMILY(proc->jobid);
        for (i=0; i < orte_routed_jobfams.size; i++) {
            if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) {
                continue;
            }
            if (jfam->job_family == jfamily) {
                OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                                     "%s routed_radix: deleting route to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOB_FAMILY_PRINT(proc->jobid)));
                opal_pointer_array_set_item(&orte_routed_jobfams, i, NULL);
                OBJ_RELEASE(jfam);
                return ORTE_SUCCESS;
            }
        }
        /* not present - nothing to do */
        return ORTE_SUCCESS;
    }

    /* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing
     * to do here. The routes will be redefined when we update
     * the routing tree */
    return ORTE_SUCCESS;
}
static int update_route(orte_process_name_t *target,
                        orte_process_name_t *route)
{
    int rc;
    orte_process_name_t *route_copy;

    if (target->jobid == ORTE_JOBID_INVALID ||
        target->vpid == ORTE_VPID_INVALID) {
        return ORTE_ERR_BAD_PARAM;
    }

    /* if I am an application process, we don't update the route since
     * we automatically route everything through the local daemon */
    if (ORTE_PROC_IS_APP) {
        return ORTE_SUCCESS;
    }

    /* if the job family is zero, then this is going to a local slave,
     * so the path is direct and there is nothing to do here */
    if (0 == ORTE_JOB_FAMILY(target->jobid)) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                         "%s routed_linear_update: %s --> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target),
                         ORTE_NAME_PRINT(route)));

    /* if this is from a different job family, then I need to
     * track how to send messages to it */
    if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
        /* if I am a daemon, then I will automatically route
         * anything to this job family via my HNP - so nothing to do
         * here, just return */
        if (ORTE_PROC_IS_DAEMON) {
            return ORTE_SUCCESS;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output,
                             "%s routed_linear_update: diff job family routing job %s --> %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(target->jobid),
                             ORTE_NAME_PRINT(route)));

        /* see if this target is already present - it will have a wildcard vpid,
         * so we have to look for it with that condition */
        rc = opal_hash_table_get_value_uint32(&jobfam_list,
                                              ORTE_JOB_FAMILY(target->jobid),
                                              (void**)&route_copy);
        if (ORTE_SUCCESS == rc && NULL != route_copy) {
            /* target already present - update the route info
             * in case it has changed */
            *route_copy = *route;
            rc = opal_hash_table_set_value_uint32(&jobfam_list,
                                                  ORTE_JOB_FAMILY(target->jobid),
                                                  route_copy);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
            }
            return rc;
        }

        /* not there, so add the route FOR THE JOB FAMILY */
        route_copy = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
        *route_copy = *route;
        rc = opal_hash_table_set_value_uint32(&jobfam_list,
                                              ORTE_JOB_FAMILY(target->jobid),
                                              route_copy);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* THIS CAME FROM OUR OWN JOB FAMILY... */
    opal_output(0, "%s CALL TO UPDATE ROUTE FOR OWN JOB FAMILY",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    return ORTE_ERR_NOT_SUPPORTED;
}
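/*
 * Companion sketch to update_route(): how a lookup side might consult the
 * same jobfam_list hash, keyed on the 16-bit job family. The get_value
 * pattern mirrors the code above; the helper itself is hypothetical.
 */
static orte_process_name_t *lookup_jobfam_route(orte_jobid_t job)
{
    orte_process_name_t *route = NULL;

    if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&jobfam_list,
                                                         ORTE_JOB_FAMILY(job),
                                                         (void**)&route)) {
        return NULL; /* no route stored for this family */
    }
    return route;
}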
void orte_rmcast_base_process_msg(orte_rmcast_msg_t *msg)
{
    orte_rmcast_channel_t channel;
    rmcast_base_recv_t *ptr, *recv = NULL;
    orte_process_name_t name;
    orte_rmcast_tag_t tag;
    int8_t flag;
    struct iovec *iovec_array = NULL;
    int32_t iovec_count = 0, i, n, isz;
    int rc = ORTE_SUCCESS;
    orte_rmcast_seq_t recvd_seq_num;
    opal_list_item_t *item;
    rmcast_seq_tracker_t *trkr, *tptr;
    rmcast_recv_log_t *log, *logptr;
    bool restart;
    opal_buffer_t alert;

    /* extract the header */
    if (ORTE_SUCCESS != (rc = extract_hdr(msg->buf, &name, &channel, &tag, &restart, &recvd_seq_num))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* if this message is from myself, ignore it */
    if (name.jobid == ORTE_PROC_MY_NAME->jobid &&
        name.vpid == ORTE_PROC_MY_NAME->vpid) {
        OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv sent from myself: %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name)));
        goto cleanup;
    }

    /* if this is a heartbeat and I am not a daemon, then ignore it
     * to avoid swamping tools */
    if (!ORTE_PROC_IS_DAEMON && ORTE_RMCAST_TAG_HEARTBEAT == tag) {
        OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv ignoring heartbeat",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this message is from a different job family, ignore it unless
     * it is on the system channel. We ignore these messages to avoid
     * confusion between different jobs since we all may be sharing
     * multicast channels. The system channel is left open to support
     * cross-job communications for detecting multiple conflicting DVMs.
     */
    if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) &&
        (ORTE_RMCAST_SYS_CHANNEL != channel)) {
        /* if we are not the HNP or a daemon, then we ignore this */
        if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
            OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv from a different job family: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&name)));
        } else {
            goto cleanup;
        }
    }

    if (orte_rmcast_base.unreliable_xport) {
        /* if the message is not on a system-specified channel, then check to see if we
         * are missing any messages and need a resend */
        if (ORTE_RMCAST_DYNAMIC_CHANNELS <= channel) {
            log = NULL;
            for (item = opal_list_get_first(&orte_rmcast_base.msg_logs);
                 item != opal_list_get_end(&orte_rmcast_base.msg_logs);
                 item = opal_list_get_next(item)) {
                logptr = (rmcast_recv_log_t*)item;
                /* look for this source */
                if (name.jobid == logptr->name.jobid &&
                    name.vpid == logptr->name.vpid) {
                    log = logptr;
                    break;
                }
            }
            if (NULL == log) {
                /* new source */
                log = OBJ_NEW(rmcast_recv_log_t);
                log->name.jobid = name.jobid;
                log->name.vpid = name.vpid;
                opal_list_append(&orte_rmcast_base.msg_logs, &log->super);
            }
            /* look for the channel */
            trkr = NULL;
            for (item = opal_list_get_first(&log->last_msg);
                 item != opal_list_get_end(&log->last_msg);
                 item = opal_list_get_next(item)) {
                tptr = (rmcast_seq_tracker_t*)item;
                if (channel == tptr->channel) {
                    trkr = tptr;
                    break;
                }
            }
            if (NULL == trkr) {
                /* new channel */
                trkr = OBJ_NEW(rmcast_seq_tracker_t);
                trkr->channel = channel;
                opal_list_append(&log->last_msg, &trkr->super);
                OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                     "%s NEW CHANNEL: %d SENDER: %s SEQ %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
            } else if (ORTE_RMCAST_SEQ_INVALID != trkr->seq_num && !restart) {
                /* if this is a repeat msg, ignore it */
                if (recvd_seq_num <= trkr->seq_num) {
                    OPAL_OUTPUT_VERBOSE((1, orte_rmcast_base.rmcast_output,
                                         "%s Repeat msg %d on channel %d from source %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         recvd_seq_num, channel, ORTE_NAME_PRINT(&name)));
                    goto cleanup; /* already delivered - discard the repeat */
                }
                if (1 != (recvd_seq_num - trkr->seq_num) ||
                    (ORTE_RMCAST_SEQ_MAX == trkr->seq_num && 0 != recvd_seq_num)) {
                    /* missing a message - request it */
                    OPAL_OUTPUT_VERBOSE((1, orte_rmcast_base.rmcast_output,
                                         "%s Missing msg %d (%d) on channel %d from source %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         recvd_seq_num, trkr->seq_num, channel,
                                         ORTE_NAME_PRINT(&name)));
                    OBJ_CONSTRUCT(&alert, opal_buffer_t);
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &channel, 1, ORTE_RMCAST_CHANNEL_T))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &trkr->seq_num, 1, ORTE_RMCAST_SEQ_T))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    if (0 > (rc = orte_rml.send_buffer(&name, &alert, ORTE_RML_TAG_MISSED_MSG, 0))) {
                        ORTE_ERROR_LOG(rc);
                        exit(1);
                    }
                    OBJ_DESTRUCT(&alert);
                    goto cleanup;
                }
                OPAL_OUTPUT_VERBOSE((10, orte_rmcast_base.rmcast_output,
                                     "%s CHANNEL: %d SENDER: %s SEQ: %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     channel, ORTE_NAME_PRINT(&log->name), recvd_seq_num));
            }
            trkr->seq_num = recvd_seq_num;
        }
    }

    /* unpack the iovec vs buf flag */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &flag, &n, OPAL_INT8))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base:process_recv sender: %s channel: %d tag: %d %s seq_num: %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&name), channel, (int)tag,
                         (0 == flag) ? "iovecs" : "buffer", recvd_seq_num));

    /* find the recv for this channel, tag, and type */
    ORTE_ACQUIRE_THREAD(&orte_rmcast_base.main_ctl);
    for (item = opal_list_get_first(&orte_rmcast_base.recvs);
         item != opal_list_get_end(&orte_rmcast_base.recvs);
         item = opal_list_get_next(item)) {
        ptr = (rmcast_base_recv_t*)item;
        OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv checking channel %d tag %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (int)ptr->channel, (int)ptr->tag));
        if (channel != ptr->channel) {
            continue;
        }
        if (tag != ptr->tag && ORTE_RMCAST_TAG_WILDCARD != ptr->tag) {
            continue;
        }
        ptr->seq_num = recvd_seq_num;
        recv = ptr;
        break;
    }
    if (NULL == recv) {
        /* recv not found - dump msg */
        ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);
        goto cleanup;
    }
    if (!(ORTE_RMCAST_PERSISTENT & recv->flags)) {
        OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                             "%s rmcast:base:process_recv removing non-persistent recv",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        opal_list_remove_item(&orte_rmcast_base.recvs, &recv->item);
    }
    ORTE_RELEASE_THREAD(&orte_rmcast_base.main_ctl);

    OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                         "%s rmcast:base:process_recv delivering message to channel %d tag %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         recv->channel, (int)tag));

    /* we have a matching recv - unpack the data */
    if (0 == flag) {
        /* get the number of iovecs in the buffer */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &iovec_count, &n, OPAL_INT32))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        /* malloc the required space */
        iovec_array = (struct iovec*)malloc(iovec_count * sizeof(struct iovec));
        /* unpack the iovecs */
        for (i=0; i < iovec_count; i++) {
            /* unpack the number of bytes in this iovec */
            n = 1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, &isz, &n, OPAL_INT32))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            iovec_array[i].iov_base = NULL;
            iovec_array[i].iov_len = isz;
            if (0 < isz) {
                /* allocate the space */
                iovec_array[i].iov_base = (IOVBASE_TYPE*)malloc(isz);
                /* unpack the data */
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(msg->buf, iovec_array[i].iov_base, &isz, OPAL_UINT8))) {
                    ORTE_ERROR_LOG(rc);
                    goto cleanup;
                }
            }
        }
        if (NULL != recv->cbfunc_iovec) {
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv delivering iovecs to channel %d tag %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 recv->channel, (int)tag));
            recv->cbfunc_iovec(ORTE_SUCCESS, recv->channel, recv->seq_num, tag,
                               &name, iovec_array, iovec_count, recv->cbdata);
        } else {
            /* if something is already present, then we have a problem */
            if (NULL != recv->iovec_array) {
                OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                     "%s rmcast:base:process_recv blocking recv already fulfilled",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto cleanup;
            }
            /* copy over the iovec array since it will be released by
             * the blocking recv */
            recv->iovec_array = (struct iovec*)malloc(iovec_count * sizeof(struct iovec));
            recv->iovec_count = iovec_count;
            for (i=0; i < iovec_count; i++) {
                recv->iovec_array[i].iov_base = (IOVBASE_TYPE*)malloc(iovec_array[i].iov_len);
                recv->iovec_array[i].iov_len = iovec_array[i].iov_len;
                memcpy(recv->iovec_array[i].iov_base, iovec_array[i].iov_base, iovec_array[i].iov_len);
            }
            /* release blocking recv */
            ORTE_WAKEUP_THREAD(&recv->ctl);
        }
    } else {
        if (NULL != recv->cbfunc_buffer) {
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv delivering buffer to channel %d tag %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 recv->channel, (int)tag));
            recv->cbfunc_buffer(ORTE_SUCCESS, recv->channel, recv->seq_num, tag,
                                &name, msg->buf, recv->cbdata);
        } else {
            /* if something is already present, then we have a problem */
            if (NULL != recv->buf) {
                OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                     "%s rmcast:base:process_recv blocking recv already fulfilled",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                goto cleanup;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_rmcast_base.rmcast_output,
                                 "%s rmcast:base:process_recv copying buffer for blocking recv",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* copy the buffer across since it will be released
             * by the blocking recv */
            recv->buf = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(recv->buf, msg->buf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* release blocking recv */
            ORTE_WAKEUP_THREAD(&recv->ctl);
        }
    }

 cleanup:
    if (NULL != iovec_array) {
        for (i=0; i < iovec_count; i++) {
            free(iovec_array[i].iov_base);
        }
        free(iovec_array);
        iovec_array = NULL;
        iovec_count = 0;
    }
    if (NULL != msg) {
        OBJ_RELEASE(msg);
    }
    if (NULL != recv && !(ORTE_RMCAST_PERSISTENT & recv->flags)) {
        OBJ_RELEASE(recv);
    }
    return;
}
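/*
 * For orientation, a sketch of what extract_hdr() presumably undoes: the
 * sender packs the source name, channel, tag, restart flag, and sequence
 * number, in that order, ahead of the payload. The field order is inferred
 * from the extract_hdr() call signature above, and the tag dss type name
 * is an assumption (the channel and seq types appear in the resend code).
 */
static int pack_hdr(opal_buffer_t *buf, orte_process_name_t *name,
                    orte_rmcast_channel_t channel, orte_rmcast_tag_t tag,
                    bool restart, orte_rmcast_seq_t seq)
{
    int rc;
    int8_t flag = restart ? 1 : 0;

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, name, 1, ORTE_NAME))) return rc;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &channel, 1, ORTE_RMCAST_CHANNEL_T))) return rc;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &tag, 1, ORTE_RMCAST_TAG_T))) return rc;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &flag, 1, OPAL_INT8))) return rc;
    return opal_dss.pack(buf, &seq, 1, ORTE_RMCAST_SEQ_T);
}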
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    int rc = ORTE_SUCCESS, i;
    orte_app_context_t *app;
    orte_node_t *node;
    orte_proc_t *pptr, *daemon, *pptr2;
    opal_buffer_t *notify;
    orcm_triplet_t *trp;
    orcm_source_t *src;
    bool procs_recovered;
    orte_job_t *jdt;
    uint16_t jfam;
    bool send_msg;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:update_state for job %s proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));

    /* protect against threads */
    ORTE_ACQUIRE_THREAD(&ctl);

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /*** UPDATE COMMAND FOR A JOB ***/
    if (NULL == proc) {
        /* should only get this if a daemon restarted and we need
         * to check for procs waiting to migrate */
        if (ORTE_JOB_STATE_PROCS_MIGRATING != jobstate) {
            /* we should never get this situation */
            opal_output(0, "%s UNKNOWN JOB ERROR %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        orte_job_state_to_str(jobstate));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERROR;
        }
        /* cycle thru all known jobs looking for those with procs
         * awaiting resources to migrate */
        for (i=0; i < orte_job_data->size; i++) {
            if (NULL == (jdt = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) {
                continue;
            }
            if (ORTE_JOB_STATE_PROCS_MIGRATING != jdt->state) {
                continue;
            }
            /* reset the job */
            orte_plm_base_reset_job(jdt);
            /* map the job again */
            if (ORTE_SUCCESS != (rc = orte_rmaps.map_job(jdt))) {
                ORTE_ERROR_LOG(rc);
                continue;
            }
            /* launch any procs that could be mapped - note that not
             * all procs that were waiting for migration may have
             * been successfully mapped, so this could in fact
             * result in no action by the daemons */
            notify = OBJ_NEW(opal_buffer_t);
            /* indicate the target DVM */
            jfam = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid);
            opal_dss.pack(notify, &jfam, 1, OPAL_UINT16);
            /* get the launch data */
            if (ORTE_SUCCESS != (rc = orte_odls.get_add_procs_data(notify, jdt->jobid))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(notify);
                ORTE_RELEASE_THREAD(&ctl);
                return ORTE_SUCCESS;
            }
            /* send it to the daemons */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_COMMAND,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /**** DEAL WITH INDIVIDUAL PROCS ****/

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s errmgr:sched got state %s for proc %s pid %d exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc), pid, exit_code));

    /* if this was a failed comm or heartbeat */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* ignore this */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* ensure that the heartbeat system knows to ignore this proc
         * from this point forward */
        daemon->beat = 0;
        /* if we have already heard about this proc, ignore repeats */
        if (ORTE_PROC_STATE_HEARTBEAT_FAILED == daemon->state) {
            /* already heard */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        }
#if 0
        /* delete the route */
        orte_routed.delete_route(proc);
        /* purge the oob */
        orte_rml.purge(proc);
#endif
        /* get the triplet/source and mark this source as "dead" */
        if (NULL == (trp = orcm_get_triplet_stringid("orcmd:0.1:alpha"))) {
            opal_output(0, "%s CANNOT FIND DAEMON TRIPLET",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        if (NULL == (src = orcm_get_source(trp, proc, false))) {
            opal_output(0, "%s DAEMON %s IS UNKNOWN SOURCE",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            ORTE_RELEASE_THREAD(&trp->ctl);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        src->alive = false;
        ORTE_RELEASE_THREAD(&src->ctl);
        ORTE_RELEASE_THREAD(&trp->ctl);

        /* notify all apps immediately */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* reset the proc stats */
            OBJ_DESTRUCT(&pptr->stats);
            OBJ_CONSTRUCT(&pptr->stats, opal_pstats_t);
            /* since we added something, need to send msg */
            send_msg = true;
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);

        /* record that the daemon died */
        daemon->state = state;
        daemon->exit_code = exit_code;
        daemon->pid = 0;
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);

        node = daemon->node;
        if (NULL == node) {
            opal_output(0, "%s Detected failure of daemon %s on unknown node",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc));
            /* can't do anything further */
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_SUCCESS;
        } else {
            opal_output(0, "%s Detected failure of daemon %s on node %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        (NULL == node->name) ? "UNKNOWN" : node->name);
        }

        /* see if any usable daemons are left alive */
        procs_recovered = false;
        for (i=2; i < daemon_job->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, i))) {
                continue;
            }
            if (ORTE_PROC_STATE_UNTERMINATED < pptr->state) {
                continue;
            }
            /* at least one alive! recover procs from the failed one */
            recover_procs(proc);
            procs_recovered = true;
            break;
        }
        if (!procs_recovered) {
            daemon->node = NULL;
            node->state = ORTE_NODE_STATE_DOWN;
            node->daemon = NULL;
            /* mark all procs on this node as having terminated */
            for (i=0; i < node->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                /* get the job data object for this process */
                if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                    /* major problem */
                    opal_output(0, "%s COULD NOT GET JOB OBJECT FOR PROC %s(%d): state %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&pptr->name), i,
                                orte_proc_state_to_str(pptr->state));
                    continue;
                }
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING PROC %s FROM NODE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&pptr->name), node->name));
                app->num_procs--;
                opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
                OBJ_RELEASE(pptr);
                /* clean it off the node */
                opal_pointer_array_set_item(node->procs, i, NULL);
                node->num_procs--;
                /* maintain acctg */
                OBJ_RELEASE(pptr);
                /* see if job is empty */
                jdt->num_terminated++;
                if (jdt->num_procs <= jdt->num_terminated) {
                    OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                         "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_JOBID_PRINT(jdt->jobid)));
                    opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                    OBJ_RELEASE(jdt);
                }
            }
        }
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_STATE_RESTARTED == state) {
        OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                             "%s RESTART OF DAEMON %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* get the proc object for this daemon */
        if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(daemon_job->procs, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        /* if apps were on that node, notify all apps immediately that
         * those procs have failed */
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, proc->vpid))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            ORTE_RELEASE_THREAD(&ctl);
            return ORTE_ERR_NOT_FOUND;
        }
        notify = OBJ_NEW(opal_buffer_t);
        send_msg = false;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(notify, &pptr->name, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                ORTE_RELEASE_THREAD(&ctl);
                return rc;
            }
            /* since we added something, we need to send msg */
            send_msg = true;
            /* remove the proc from the app so that it will get
             * restarted when we re-activate the config */
            if (NULL == (jdt = orte_get_job_data_object(pptr->name.jobid))) {
                continue;
            }
            if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdt->apps, pptr->app_idx))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                 "%s REMOVING PROC %s FROM NODE %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&pptr->name), node->name));
            app->num_procs--;
            opal_pointer_array_set_item(jdt->procs, pptr->name.vpid, NULL);
            OBJ_RELEASE(pptr);
            /* clean it off the node */
            opal_pointer_array_set_item(node->procs, i, NULL);
            node->num_procs--;
            /* maintain acctg */
            OBJ_RELEASE(pptr);
            /* see if job is empty */
            jdt->num_terminated++;
            if (jdt->num_procs <= jdt->num_terminated) {
                OPAL_OUTPUT_VERBOSE((3, orte_errmgr_base.output,
                                     "%s REMOVING JOB %s FROM ACTIVE ARRAY",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_JOBID_PRINT(jdt->jobid)));
                opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdt->jobid), NULL);
                OBJ_RELEASE(jdt);
            }
        }
        if (send_msg) {
            /* send it to all apps */
            if (ORCM_SUCCESS != (rc = orcm_pnp.output_nb(ORCM_PNP_ERROR_CHANNEL, NULL,
                                                         ORCM_PNP_TAG_ERRMGR,
                                                         NULL, 0, notify, cbfunc, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        } else {
            OBJ_RELEASE(notify);
        }
        /* reset the node stats */
        OBJ_DESTRUCT(&node->stats);
        OBJ_CONSTRUCT(&node->stats, opal_node_stats_t);
        /* reset the daemon stats */
        OBJ_DESTRUCT(&daemon->stats);
        OBJ_CONSTRUCT(&daemon->stats, opal_pstats_t);
        /* don't restart procs - we'll do that later after
         * we allow time for multiple daemons to restart */
        ORTE_RELEASE_THREAD(&ctl);
        return ORTE_SUCCESS;
    }

    /* to arrive here is an error */
    opal_output(0, "%s GOT UNRECOGNIZED STATE %s FOR PROC %s",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                orte_proc_state_to_str(state),
                ORTE_NAME_PRINT(proc));
    /* release the lock on this error path as well */
    ORTE_RELEASE_THREAD(&ctl);
    return ORTE_ERROR;
}
void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata) { mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata; int rc; orte_process_name_t hop; mca_oob_tcp_peer_t *relay; uint64_t ui64; if (orte_abnormal_term_ordered) { return; } opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); switch (peer->state) { case MCA_OOB_TCP_CONNECT_ACK: if (ORTE_SUCCESS == (rc = mca_oob_tcp_peer_recv_connect_ack(peer, peer->sd, NULL))) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler starting send/recv events", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* we connected! Start the send/recv events */ if (!peer->recv_ev_active) { opal_event_add(&peer->recv_event, 0); peer->recv_ev_active = true; } if (peer->timer_ev_active) { opal_event_del(&peer->timer_event); peer->timer_ev_active = false; } /* if there is a message waiting to be sent, queue it */ if (NULL == peer->send_msg) { peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue); } if (NULL != peer->send_msg && !peer->send_ev_active) { opal_event_add(&peer->send_event, 0); peer->send_ev_active = true; } /* update our state */ peer->state = MCA_OOB_TCP_CONNECTED; } else { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s UNABLE TO COMPLETE CONNECT ACK WITH %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); opal_event_del(&peer->recv_event); ORTE_FORCED_TERMINATE(1); return; } break; case MCA_OOB_TCP_CONNECTED: opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler CONNECTED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* allocate a new message and setup for recv */ if (NULL == peer->recv_msg) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler allocate new recv msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); peer->recv_msg = OBJ_NEW(mca_oob_tcp_recv_t); if (NULL == peer->recv_msg) { opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); return; } /* start by reading the header */ peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr; peer->recv_msg->rdbytes = sizeof(mca_oob_tcp_hdr_t); } /* if the header hasn't been completely read, read it */ if (!peer->recv_msg->hdr_recvd) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler read hdr", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if (ORTE_SUCCESS == (rc = read_bytes(peer))) { OPAL_TIMING_EVENT((&tm,"Header received from %s", ORTE_NAME_PRINT(&peer->name))); /* completed reading the header */ peer->recv_msg->hdr_recvd = true; /* convert the header */ MCA_OOB_TCP_HDR_NTOH(&peer->recv_msg->hdr); /* if this is a zero-byte message, then we are done */ if (0 == peer->recv_msg->hdr.nbytes) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag); peer->recv_msg->data = NULL; // make sure } else { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler allocate data region of size %lu", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes); /* 
allocate the data region */ peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes); /* point to it */ peer->recv_msg->rdptr = peer->recv_msg->data; peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes; } /* fall thru and attempt to read the data */ } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { /* close the connection */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s:tcp:recv:handler error reading bytes - closing connection", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); mca_oob_tcp_peer_close(peer); return; } } if (peer->recv_msg->hdr_recvd) { /* continue to read the data block - we start from * wherever we left off, which could be at the * beginning or somewhere in the message */ if (ORTE_SUCCESS == (rc = read_bytes(peer))) { OPAL_TIMING_EVENT((&tm,"Msg received from %s", ORTE_NAME_PRINT(&peer->name))); /* we recvd all of the message */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s RECVD COMPLETE MESSAGE FROM %s (ORIGIN %s) OF %d BYTES FOR DEST %s TAG %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name), ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin), (int)peer->recv_msg->hdr.nbytes, ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst), peer->recv_msg->hdr.tag); /* am I the intended recipient (header was already converted back to host order)? */ if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid && peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) { /* yes - post it to the RML for delivery */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s DELIVERING TO RML", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag, peer->recv_msg->data, peer->recv_msg->hdr.nbytes); OBJ_RELEASE(peer->recv_msg); } else { /* no - find the next hop in the route */ hop = orte_routed.get_route(&peer->recv_msg->hdr.dst); if (hop.jobid == ORTE_JOBID_INVALID || hop.vpid == ORTE_VPID_INVALID) { /* no hop known - post the error to the component * and let the OOB see if there is another way * to get there from here */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s NO ROUTE TO %s FROM HERE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer->name)); /* let the component know about the problem */ ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_no_route); /* cleanup */ OBJ_RELEASE(peer->recv_msg); return; } else { /* do we know how to reach the next hop?
*/ memcpy(&ui64, (char*)&hop, sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_module.peers, ui64, (void**)&relay)) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s ADDRESS OF NEXT HOP %s TO %s IS UNKNOWN", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&hop), ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst)); /* let the component know about the problem */ ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_hop_unknown); /* cleanup */ OBJ_RELEASE(peer->recv_msg); return; } opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s ROUTING TO %s FROM HERE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&relay->name)); /* if this came from a different job family, then ensure * we know how to return */ if (ORTE_JOB_FAMILY(peer->recv_msg->hdr.origin.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { orte_routed.update_route(&(peer->recv_msg->hdr.origin), &peer->name); } /* post the message for retransmission */ MCA_OOB_TCP_QUEUE_RELAY(peer->recv_msg, relay); OBJ_RELEASE(peer->recv_msg); } } peer->recv_msg = NULL; return; } else if (ORTE_ERR_RESOURCE_BUSY == rc || ORTE_ERR_WOULD_BLOCK == rc) { /* exit this event and let the event lib progress */ return; } else { // report the error opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to recv message", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name))); /* turn off the recv event */ opal_event_del(&peer->recv_event); ORTE_FORCED_TERMINATE(1); return; } } break; default: opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&(peer->name)), peer->state); // mca_oob_tcp_peer_close(peer); break; } }
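/*
 * A standalone sketch of the hash-key trick used in the relay path above: the
 * next-hop process name (two 32-bit fields) is copied bitwise into a uint64_t
 * so it can serve as a single key into the peer hash table, mirroring the
 * memcpy(&ui64, (char*)&hop, sizeof(uint64_t)) call. The struct below is an
 * illustrative stand-in for orte_process_name_t, assuming the name is exactly
 * 64 bits with no padding.
 */
#include <stdint.h>
#include <string.h>

typedef struct { uint32_t jobid; uint32_t vpid; } name_t;

static uint64_t name_to_key(const name_t *name) {
    uint64_t key;
    memcpy(&key, name, sizeof(key));   /* bitwise pack: jobid|vpid -> one key */
    return key;
}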
int orte_rml_base_update_contact_info(opal_buffer_t* data) { orte_std_cntr_t cnt; orte_vpid_t num_procs; char *rml_uri; orte_process_name_t name; bool got_name; int rc; /* unpack the data for each entry */ num_procs = 0; name.jobid = ORTE_JOBID_INVALID; got_name = false; cnt = 1; while (ORTE_SUCCESS == (rc = opal_dss.unpack(data, &rml_uri, &cnt, OPAL_STRING))) { OPAL_OUTPUT_VERBOSE((5, orte_rml_base_framework.framework_output, "%s rml:base:update:contact:info got uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), NULL == rml_uri ? "NULL" : rml_uri)); if (NULL != rml_uri) { /* set the contact info into the hash table */ orte_rml.set_contact_info(rml_uri); if (!got_name) { /* we only get an update from a single jobid - the command * that creates these doesn't cross jobid boundaries - so * record it here */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(rml_uri, &name, NULL))) { ORTE_ERROR_LOG(rc); free(rml_uri); return rc; } got_name = true; /* if this is for a different job family, update the route to this proc */ if (ORTE_JOB_FAMILY(name.jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { if (ORTE_SUCCESS != (rc = orte_routed.update_route(&name, &name))) { ORTE_ERROR_LOG(rc); free(rml_uri); return rc; } } } free(rml_uri); } /* track how many procs were in the message */ ++num_procs; } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { ORTE_ERROR_LOG(rc); return rc; } /* if we are a daemon and this was info about our jobid, this update would * include updated contact info * for all daemons in the system - indicating that the number of daemons * changed since we were initially launched. Thus, update the num_procs * in our process_info struct so we can correctly route any messages */ if (ORTE_PROC_MY_NAME->jobid == name.jobid && ORTE_PROC_IS_DAEMON && orte_process_info.num_procs < num_procs) { orte_process_info.num_procs = num_procs; if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /* if we changed it, then we better update the routing * plan so daemon collectives work correctly */ orte_routed.update_routing_plan(); } return ORTE_SUCCESS; }
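/*
 * The contact-info strings unpacked above follow the ORTE convention of
 * "jobid.vpid;uri[;uri...]" (e.g. "1234.0;tcp://10.0.0.1:5000"). A hedged
 * sketch of extracting just the process name from such a string - roughly the
 * name-parsing half of what orte_rml_base_parse_uris does, with the real
 * error handling and URI handling omitted:
 */
#include <inttypes.h>
#include <stdio.h>

static int parse_contact_name(const char *uri, uint32_t *jobid, uint32_t *vpid) {
    /* expect "jobid.vpid;..."; anything else is malformed */
    return (2 == sscanf(uri, "%" SCNu32 ".%" SCNu32, jobid, vpid)) ? 0 : -1;
}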
static int delete_route(orte_process_name_t *proc) { int rc; orte_process_name_t *route_copy; if (proc->jobid == ORTE_JOBID_INVALID || proc->vpid == ORTE_VPID_INVALID) { return ORTE_ERR_BAD_PARAM; } /* if I am an application process, I don't have any routes * so there is nothing for me to do */ if (!ORTE_PROC_IS_HNP && !ORTE_PROC_IS_DAEMON && !ORTE_PROC_IS_TOOL) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_binomial_delete_route for %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if this is from a different job family, then I need to * look it up appropriately */ if (ORTE_JOB_FAMILY(proc->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, then I will automatically route * anything to this job family via my HNP - so I have nothing * in my routing table and thus have nothing to do * here, just return */ if (ORTE_PROC_IS_DAEMON) { return ORTE_SUCCESS; } /* see if this proc is present - it will have a wildcard vpid, * so we have to look for it with that condition */ rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid), (void**)&route_copy); if (ORTE_SUCCESS == rc && NULL != route_copy) { /* proc is present - remove the data */ free(route_copy); rc = opal_hash_table_remove_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(proc->jobid)); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); } return rc; } /* not present - nothing to do */ return ORTE_SUCCESS; } /* THIS CAME FROM OUR OWN JOB FAMILY...there is nothing * to do here. The routes will be redefined when we update * the routing tree */ return ORTE_SUCCESS; }
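/*
 * The job-family tests that dominate delete_route() (and the other routed
 * modules in this file) rest on the jobid layout: the upper 16 bits of the
 * 32-bit jobid identify the "job family" (all jobs spawned from one
 * mpirun/HNP), the lower 16 bits the local jobid within that family. A
 * standalone sketch with illustrative macros (ORTE's own equivalents live in
 * orte/util/name_fns.h):
 */
#include <inttypes.h>
#include <stdio.h>

#define JOB_FAMILY(jobid)  (((jobid) >> 16) & 0x0000ffff)
#define LOCAL_JOBID(jobid) ((jobid) & 0x0000ffff)

int main(void) {
    uint32_t jobid = (42u << 16) | 3u;   /* family 42, local job 3 */
    printf("family=%" PRIu32 " local=%" PRIu32 "\n",
           (uint32_t)JOB_FAMILY(jobid), (uint32_t)LOCAL_JOBID(jobid));
    return 0;
}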
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; char *rmluri; opal_value_t *kv; char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; char **peers=NULL, *mycpuset, **cpusets=NULL; opal_process_name_t name; size_t i; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); /* we cannot run */ error = "pmix init"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) { /* we cannot run */ error = "pmix init"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) { /* we cannot run */ error = "pmix init"; goto error; } u32ptr = &u32; u16ptr = &u16; /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* pmix.init set our process name down in the OPAL layer, * so carry it forward here */ ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; /* get our local rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting local rank"; goto error; } orte_process_info.my_local_rank = u16; /* get our node rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting node rank"; goto error; } orte_process_info.my_node_rank = u16; /* get max procs */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting max procs"; goto error; } orte_process_info.max_procs = u32; /* get job size */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting job size"; goto error; } orte_process_info.num_procs = u32; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.app_num = u32; } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_local_peers = u32 - 1; // want number besides ourselves } else { orte_process_info.num_local_peers = 0; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = 
ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } opal_output_verbose(2, orte_ess_base_framework.framework_output, "%s transport key %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key); asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } /* retrieve our topology */ val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* load the topology */ if (0 != hwloc_topology_init(&opal_hwloc_topology)) { ret = OPAL_ERROR; free(val); error = "setting topology"; goto error; } if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) { ret = OPAL_ERROR; free(val); hwloc_topology_destroy(opal_hwloc_topology); error = "setting topology"; goto error; } /* since we are loading this from an external source, we have to * explicitly set a flag so hwloc sets things up correctly */ if (0 != hwloc_topology_set_flags(opal_hwloc_topology, (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } /* now load the topology */ if (0 != hwloc_topology_load(opal_hwloc_topology)) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } free(val); /* filter the cpus thru any default cpu set */ if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) { error = "filtering topology"; goto error; } } else { /* it wasn't passed down to us, so go get it */ if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "topology discovery"; goto error; } /* push it into the PMIx database in case someone * tries to retrieve it so we avoid an attempt to * get it again */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCAL_TOPO); kv->type = OPAL_STRING; if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) { error = "topology export"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) { error = "topology store"; goto error; } OBJ_RELEASE(kv); } /* get our local peers */ if (0 < orte_process_info.num_local_peers) { /* if my local rank is too high, then that's an error */ if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) { ret = ORTE_ERR_BAD_PARAM; error = "num local peers"; goto error; } /* retrieve the local peers */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); /* and their cpusets, if available */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { cpusets = opal_argv_split(val, ':'); free(val); } else { cpusets = NULL; } } else { peers = NULL; cpusets = NULL; } } else { peers = NULL; cpusets = NULL; } /* set the locality */ if (NULL != peers) { /* identify our cpuset */ if (NULL != cpusets) { mycpuset = cpusets[orte_process_info.my_local_rank]; } else { mycpuset = NULL; } name.jobid =
ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALITY); kv->type = OPAL_UINT16; name.vpid = strtoul(peers[i], NULL, 10); if (name.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; } else if (NULL == mycpuset || NULL == cpusets[i] || 0 == strcmp(cpusets[i], "UNBOUND")) { /* all we can say is that it shares our node */ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* we have it, so compute the locality */ u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]); } OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, "%s ess:pmi:locality: proc %s locality %x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name), u16)); kv->data.uint16 = u16; ret = opal_pmix.store_local(&name, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); opal_argv_free(cpusets); goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); opal_argv_free(cpusets); } /* now that we have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /*** PUSH DATA FOR OTHERS TO FIND ***/ /* push our RML URI in case others need to talk directly to us */ rmluri = orte_rml.get_contact_info(); /* push it out for others to use */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "pmix put uri"; goto error; } free(rmluri); /* push our hostname so others can find us, if they need to */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. * Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (!progress_thread_running) { /* can't send the help message, so ensure it * comes out locally */ orte_show_help_finalize(); } if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
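/*
 * A standalone sketch of the transport-key derivation used in rte_init()
 * above: the two 64-bit "pre-condition transport" keys are simply the job
 * family and local jobid halves of the 32-bit jobid, unique because the RM
 * assigns them. The hex format below is illustrative only - the real layout
 * is defined by orte_pre_condition_transports_print().
 */
#include <inttypes.h>
#include <stdio.h>

static void make_transport_keys(uint32_t jobid, uint64_t key[2],
                                char *out, size_t outlen) {
    key[0] = (jobid >> 16) & 0x0000ffff;   /* job family */
    key[1] = jobid & 0x0000ffff;           /* local jobid (stepid) */
    snprintf(out, outlen, "%016" PRIx64 "-%016" PRIx64, key[0], key[1]);
}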
static orte_process_name_t get_route(orte_process_name_t *target) { orte_process_name_t *ret, daemon; int rc; /* if it is me, then the route is just direct */ if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) { ret = target; goto found; } /* if I am an application process, always route via my local daemon */ if (ORTE_PROC_IS_APP) { ret = ORTE_PROC_MY_DAEMON; goto found; } /****** HNP AND DAEMONS ONLY ******/ /* if the job family is zero, then this is going to a local slave, * so the path is direct */ if (0 == ORTE_JOB_FAMILY(target->jobid)) { ret = target; goto found; } /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, route this via the HNP */ if (ORTE_PROC_IS_DAEMON) { ret = ORTE_PROC_MY_HNP; goto found; } /* if I am the HNP or a tool, then I stored a route to * this job family, so look it up */ rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&ret); if (ORTE_SUCCESS == rc) { /* got a good result - return it */ goto found; } /* not found - so we have no route */ ret = ORTE_NAME_INVALID; goto found; } /* THIS CAME FROM OUR OWN JOB FAMILY... */ /* if we are not using static ports and this is going to the HNP, send direct */ if (!orte_static_ports && ORTE_PROC_MY_HNP->jobid == target->jobid && ORTE_PROC_MY_HNP->vpid == target->vpid) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, "%s routing not enabled - going direct", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ret = target; goto found; } daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = ORTE_NAME_INVALID; goto found; } /* if the daemon is me, then send direct to the target! */ if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { ret = target; } else { /* the linear routing tree is trivial - if the vpid is * lower than mine, route through my parent, which is * at my_vpid-1. If the vpid is higher than mine, then * route to my_vpid+1, wrapping around to 0 */ if (daemon.vpid < ORTE_PROC_MY_NAME->vpid) { daemon.vpid = ORTE_PROC_MY_NAME->vpid - 1; ret = &daemon; } else { if (ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) { daemon.vpid = ORTE_PROC_MY_NAME->vpid + 1; } else { /* we are at end of chain - wrap around */ daemon.vpid = 0; } ret = &daemon; } } found: OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_linear_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret))); return *ret; }
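/*
 * The linear routing rule above reduces to a few lines of arithmetic. A
 * standalone restatement as a pure function over daemon vpids, with
 * num_daemons playing the role of orte_process_info.num_procs:
 */
#include <stdint.h>

static uint32_t linear_next_hop(uint32_t me, uint32_t target_daemon,
                                uint32_t num_daemons) {
    if (target_daemon == me) {
        return target_daemon;            /* deliver locally */
    }
    if (target_daemon < me) {
        return me - 1;                   /* step down toward my parent */
    }
    /* step up the chain, wrapping around to vpid 0 at the end */
    return (me < num_daemons - 1) ? me + 1 : 0;
}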
static orte_process_name_t get_route(orte_process_name_t *target) { orte_process_name_t *ret, daemon; opal_list_item_t *item; orte_routed_tree_t *child; int rc; if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { ret = ORTE_NAME_INVALID; goto found; } /* if it is me, then the route is just direct */ if (OPAL_EQUAL == opal_dss.compare(ORTE_PROC_MY_NAME, target, ORTE_NAME)) { ret = target; goto found; } /* if I am an application process, always route via my local daemon */ if (ORTE_PROC_IS_APP) { ret = ORTE_PROC_MY_DAEMON; goto found; } /****** HNP AND DAEMONS ONLY ******/ /* if the job family is zero, then this is going to a local slave, * so the path is direct */ if (0 == ORTE_JOB_FAMILY(target->jobid)) { ret = target; goto found; } /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, route this via the HNP */ if (ORTE_PROC_IS_DAEMON) { ret = ORTE_PROC_MY_HNP; goto found; } /* if I am the HNP or a tool, then I stored a route to * this job family, so look it up */ rc = opal_hash_table_get_value_uint32(&jobfam_list, ORTE_JOB_FAMILY(target->jobid), (void**)&ret); if (ORTE_SUCCESS == rc) { /* got a good result - return it */ goto found; } /* not found - so we have no route */ ret = ORTE_NAME_INVALID; goto found; } /* THIS CAME FROM OUR OWN JOB FAMILY... */ /* if we are not using static ports and this is going to the HNP, send direct */ if (!orte_static_ports && ORTE_PROC_MY_HNP->jobid == target->jobid && ORTE_PROC_MY_HNP->vpid == target->vpid) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_output, "%s routing not enabled - going direct", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ret = target; goto found; } daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ if (ORTE_VPID_INVALID == (daemon.vpid = orte_ess.proc_get_daemon(target))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ret = ORTE_NAME_INVALID; goto found; } /* if the daemon is me, then send direct to the target! */ if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { ret = target; goto found; } else { /* search routing tree for next step to that daemon */ for (item = opal_list_get_first(&my_children); item != opal_list_get_end(&my_children); item = opal_list_get_next(item)) { child = (orte_routed_tree_t*)item; if (child->vpid == daemon.vpid) { /* the child is hosting the proc - just send it there */ ret = &daemon; goto found; } /* otherwise, see if the daemon we need is below the child */ if (opal_bitmap_is_set_bit(&child->relatives, daemon.vpid)) { /* yep - we need to step through this child */ daemon.vpid = child->vpid; ret = &daemon; goto found; } } } /* if we get here, then the target daemon is not beneath * any of our children, so we have to step up through our parent */ daemon.vpid = my_parent.vpid; ret = &daemon; found: OPAL_OUTPUT_VERBOSE((1, orte_routed_base_output, "%s routed_radix_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret))); return *ret; }
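/*
 * A condensed standalone model of the radix/tree next-hop search above. Each
 * child entry carries its vpid plus a membership map of every daemon in the
 * subtree beneath it; if the target daemon is neither a direct child nor below
 * one, the message is routed up to the parent. The types are simplified
 * stand-ins for orte_routed_tree_t and opal_bitmap_t.
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint32_t vpid;          /* this child's daemon vpid */
    const bool *relatives;  /* relatives[v] true if daemon v is below this child */
} child_t;

static uint32_t tree_next_hop(uint32_t target, uint32_t parent,
                              const child_t *children, int nchildren) {
    for (int i = 0; i < nchildren; i++) {
        if (children[i].vpid == target) {
            return target;               /* target is a direct child */
        }
        if (children[i].relatives[target]) {
            return children[i].vpid;     /* target lives in this child's subtree */
        }
    }
    return parent;                       /* not below us - route upward */
}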
static int update_route(orte_process_name_t *target, orte_process_name_t *route) { int i; orte_routed_jobfam_t *jfam; uint16_t jfamily; if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { return ORTE_ERR_BAD_PARAM; } /* if I am an application process, we don't update the route since * we automatically route everything through the local daemon */ if (ORTE_PROC_IS_APP) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix_update: %s --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(route))); /* if I am a daemon and the target is my HNP, then check * the route - if it isn't direct, then we just flag that * we have a route to the HNP */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target) && OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, route)) { hnp_direct = false; return ORTE_SUCCESS; } /* if this is from a different job family, then I need to * track how to send messages to it */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, then I will automatically route * anything to this job family via my HNP - so nothing to do * here, just return */ if (ORTE_PROC_IS_DAEMON) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix_update: diff job family routing job %s --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(target->jobid), ORTE_NAME_PRINT(route))); /* see if this target is already present */ jfamily = ORTE_JOB_FAMILY(target->jobid); for (i=0; i < orte_routed_jobfams.size; i++) { if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) { continue; } if (jfam->job_family == jfamily) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_radix: updating route to %s via %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOB_FAMILY_PRINT(target->jobid), ORTE_NAME_PRINT(route))); jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; return ORTE_SUCCESS; } } /* not there, so add the route FOR THE JOB FAMILY*/ OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_radix: adding route to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOB_FAMILY_PRINT(target->jobid))); jfam = OBJ_NEW(orte_routed_jobfam_t); jfam->job_family = jfamily; jfam->route.jobid = route->jobid; jfam->route.vpid = route->vpid; opal_pointer_array_add(&orte_routed_jobfams, jfam); return ORTE_SUCCESS; } return ORTE_SUCCESS; }
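/*
 * The job-family table maintenance in update_route() is a plain find-or-add
 * over a sparse pointer array. A standalone sketch with a fixed-size array in
 * place of opal_pointer_array_t and OBJ_NEW'd orte_routed_jobfam_t entries:
 */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint16_t family; uint32_t route_jobid; uint32_t route_vpid; } jfam_route_t;

static int set_jfam_route(jfam_route_t *table, size_t *len, size_t cap,
                          uint16_t family, uint32_t jobid, uint32_t vpid) {
    for (size_t i = 0; i < *len; i++) {
        if (table[i].family == family) {   /* already known - update in place */
            table[i].route_jobid = jobid;
            table[i].route_vpid = vpid;
            return 0;
        }
    }
    if (*len == cap) return -1;            /* table full */
    table[*len].family = family;           /* not found - append a new entry */
    table[*len].route_jobid = jobid;
    table[*len].route_vpid = vpid;
    (*len)++;
    return 0;
}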
static orte_process_name_t get_route(orte_process_name_t *target) { orte_process_name_t *ret, daemon; orte_routed_jobfam_t *jfam; int i; uint16_t jfamily; if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { ret = ORTE_NAME_INVALID; goto found; } /* initialize */ daemon.jobid = ORTE_PROC_MY_DAEMON->jobid; daemon.vpid = ORTE_PROC_MY_DAEMON->vpid; if (ORTE_PROC_IS_APP) { /* if I am an application, AND I have knowledge of * my daemon (i.e., a daemon launched me), then I * always route thru the daemon */ if (NULL != orte_process_info.my_daemon_uri) { ret = ORTE_PROC_MY_DAEMON; } else { /* I was direct launched and do not have * a daemon, so I have to route direct */ ret = target; } goto found; } /* if I am a tool, the route is direct if target is in * my own job family, and to the target's HNP if not */ if (ORTE_PROC_IS_TOOL) { if (ORTE_JOB_FAMILY(target->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { ret = target; goto found; } else { ORTE_HNP_NAME_FROM_JOB(&daemon, target->jobid); ret = &daemon; goto found; } } /****** HNP AND DAEMONS ONLY ******/ /* IF THIS IS FOR A DIFFERENT JOB FAMILY... */ if (ORTE_JOB_FAMILY(target->jobid) != ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { /* if I am a daemon, route this via the HNP */ if (ORTE_PROC_IS_DAEMON) { ret = ORTE_PROC_MY_HNP; goto found; } /* if I am the HNP, then I stored a route to * this job family, so look it up */ jfamily = ORTE_JOB_FAMILY(target->jobid); for (i=0; i < orte_routed_jobfams.size; i++) { if (NULL == (jfam = (orte_routed_jobfam_t*)opal_pointer_array_get_item(&orte_routed_jobfams, i))) { continue; } if (jfam->job_family == jfamily) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_direct: route to %s found", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOB_FAMILY_PRINT(target->jobid))); ret = &jfam->route; goto found; } } /* not found - so we have no route */ ret = ORTE_NAME_INVALID; goto found; } /* THIS CAME FROM OUR OWN JOB FAMILY... */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_HNP, target)) { OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routing direct to the HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ret = ORTE_PROC_MY_HNP; goto found; } daemon.jobid = ORTE_PROC_MY_NAME->jobid; /* find out what daemon hosts this proc */ if (ORTE_VPID_INVALID == (daemon.vpid = orte_get_proc_daemon_vpid(target))) { ret = ORTE_NAME_INVALID; goto found; } /* if the daemon is me, then send direct to the target! */ if (ORTE_PROC_MY_NAME->vpid == daemon.vpid) { ret = target; goto found; } /* else route to this daemon directly */ ret = &daemon; found: OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_direct_get(%s) --> %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(ret))); return *ret; }
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) { /* the radix module routes all proc communications through * the local daemon. Daemons must identify which of their * daemon-peers is "hosting" the specified recipient and * route the message to that daemon. Daemon contact info * is handled elsewhere, so all we need to do here is * ensure that the procs are told to route through their * local daemon, and that daemons are told how to route * for each proc */ int rc; /* if I am a tool, then I stand alone - there is nothing to do */ if (ORTE_PROC_IS_TOOL) { return ORTE_SUCCESS; } /* if I am a daemon or HNP, then I have to extract the routing info for this job * from the data sent to me for launch and update the routing tables to * point at the daemon for each proc */ if (ORTE_PROC_IS_DAEMON) { OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix: init routes for daemon job %s\n\thnp_uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); if (NULL == ndat) { /* indicates this is being called during orte_init. * Get the HNP's name for possible later use */ if (NULL == orte_process_info.my_hnp_uri) { /* fatal error */ ORTE_ERROR_LOG(ORTE_ERR_FATAL); return ORTE_ERR_FATAL; } /* set the contact info into the hash table */ orte_rml.set_contact_info(orte_process_info.my_hnp_uri); /* extract the hnp name and store it */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* if we are using static ports, set my lifeline to point at my parent */ if (orte_static_ports) { lifeline = ORTE_PROC_MY_PARENT; } else { /* set our lifeline to the HNP - we will abort if that connection is lost */ lifeline = ORTE_PROC_MY_HNP; } /* daemons will send their contact info back to the HNP as * part of the message confirming they are ready to go. HNPs * load their contact info during orte_init */ } else { /* ndat != NULL means we are getting an update of RML info * for the daemons - so update our contact info and routes */ if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { ORTE_ERROR_LOG(rc); } return rc; } OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_radix: completed init routes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; } if (ORTE_PROC_IS_HNP) { OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix: init routes for HNP job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job))); if (NULL == ndat) { /* the HNP has no lifeline */ lifeline = NULL; } else { /* if this is for my own jobid, then I am getting an update of RML info * for the daemons - so update our contact info and routes */ if (ORTE_PROC_MY_NAME->jobid == job) { if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { ORTE_ERROR_LOG(rc); return rc; } } else { /* if not, then I need to process the callback */ if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) { ORTE_ERROR_LOG(rc); return rc; } } } return ORTE_SUCCESS; } { /* MUST BE A PROC */ /* if ndat != NULL, then this is being invoked by the proc to * init a route to a specified process that is outside of our * job family. We want that route to go through our HNP, routed via * our local daemon - however, we cannot know for * certain that the HNP already knows how to talk to the specified * procs.
For example, in OMPI's publish/subscribe procedures, the * DPM framework looks for an mca param containing the global ompi-server's * uri. This info will come here so the proc can setup a route to * the server - we need to pass the routing info to our HNP */ if (NULL != ndat) { int rc; opal_buffer_t *xfer; orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD; bool ack_waiting; OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix: init routes w/non-NULL data", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) { /* if this is for a different job family, then we route via our HNP * to minimize connection counts to entities such as ompi-server, so * start by sending the contact info to the HNP for update */ OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix_init_routes: diff job family - sending update to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_HNP))); /* prep the buffer for transmission to the HNP */ xfer = OBJ_NEW(opal_buffer_t); opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD); opal_dss.copy_payload(xfer, ndat); /* save any new connections for use in subsequent connect_accept calls */ orte_routed_base_update_hnps(ndat); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer, ORTE_RML_TAG_RML_INFO_UPDATE, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return rc; } /* wait right here until the HNP acks the update to ensure that * any subsequent messaging can succeed */ ack_waiting = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, &ack_waiting); ORTE_WAIT_FOR_COMPLETION(ack_waiting); OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix_init_routes: ack recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* our get_route function automatically routes all messages for * other job families via the HNP, so nothing more to do here */ } return ORTE_SUCCESS; } /* if ndat=NULL, then we are being called during orte_init. In this * case, we need to setup a few critical pieces of info */ OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_radix: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri, (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri)); if (NULL == orte_process_info.my_daemon_uri) { /* in this module, we absolutely MUST have this information - if * we didn't get it, then error out */ opal_output(0, "%s ERROR: Failed to identify the local daemon's URI", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_output(0, "%s ERROR: This is a fatal condition when the radix router", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_output(0, "%s ERROR: has been selected - either select the unity router", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_output(0, "%s ERROR: or ensure that the local daemon info is provided", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); return ORTE_ERR_FATAL; } /* we have to set the HNP's name, even though we won't route messages directly * to it. 
This is required to ensure that we -do- send messages to the correct * HNP name */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* Set the contact info in the RML - this won't actually establish * the connection, but just tells the RML how to reach the daemon * if/when we attempt to send to it */ orte_rml.set_contact_info(orte_process_info.my_daemon_uri); /* extract the daemon's name so we can update the routing table */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, ORTE_PROC_MY_DAEMON, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* set our lifeline to the local daemon - we will abort if this connection is lost */ lifeline = ORTE_PROC_MY_DAEMON; /* register ourselves -this sends a message to the daemon (warming up that connection) * and sends our contact info to the HNP when all local procs have reported * * NOTE: it may seem odd that we send our contact info to the HNP - after all, * the HNP doesn't really need to know how to talk to us directly if we are * using this routing method. However, this is good for two reasons: * * (1) some debuggers and/or tools may need RML contact * info to set themselves up * * (2) doing so allows the HNP to "block" in a dynamic launch * until all procs are reported running, thus ensuring that no communication * is attempted until the overall ORTE system knows how to talk to everyone - * otherwise, the system can just hang. */ if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) { ORTE_ERROR_LOG(rc); return rc; } /* no answer is expected or coming */ return ORTE_SUCCESS; } }
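/*
 * The ORTE_WAIT_FOR_COMPLETION(ack_waiting) step above (also used by the
 * direct module later in this file) is a progressed wait: post a
 * non-persistent recv whose callback clears a flag, then spin the progress
 * engine until it drops. A hedged standalone model of that control flow,
 * where progress() is a stand-in for opal_progress():
 */
#include <stdbool.h>

extern void progress(void);   /* stand-in: advances pending events and I/O */

static volatile bool ack_waiting;

static void on_ack(void) {    /* what the posted recv callback boils down to */
    ack_waiting = false;
}

static void wait_for_ack(void) {
    ack_waiting = true;
    /* ... post the non-persistent recv that will eventually call on_ack() ... */
    while (ack_waiting) {
        progress();           /* the body of ORTE_WAIT_FOR_COMPLETION */
    }
}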
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; char *rmluri; opal_value_t *kv, kvn; opal_list_t vals; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* we don't have to call pmix.init because the pmix select did it */ /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* get our jobid from PMI */ if (!opal_pmix.get_attr(PMIX_JOBID, &kv)) { error = "getting jobid"; ret = ORTE_ERR_NOT_FOUND; goto error; } ORTE_PROC_MY_NAME->jobid = kv->data.uint32; OBJ_RELEASE(kv); /* get our global rank from PMI */ if (!opal_pmix.get_attr(PMIX_RANK, &kv)) { error = "getting rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } ORTE_PROC_MY_NAME->vpid = kv->data.uint32; OBJ_RELEASE(kv); /* get our local rank from PMI */ if (!opal_pmix.get_attr(PMIX_LOCAL_RANK, &kv)) { error = "getting local rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.my_local_rank = (orte_local_rank_t)kv->data.uint16; OBJ_RELEASE(kv); /* get our node rank from PMI */ if (!opal_pmix.get_attr(PMIX_NODE_RANK, &kv)) { error = "getting node rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.my_node_rank = (orte_node_rank_t)kv->data.uint16; OBJ_RELEASE(kv); /* get universe size */ if (!opal_pmix.get_attr(PMIX_UNIV_SIZE, &kv)) { error = "getting univ size"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.num_procs = kv->data.uint32; OBJ_RELEASE(kv); /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ if (opal_pmix.get_attr(PMIX_APPNUM, &kv)) { orte_process_info.app_num = kv->data.uint32; OBJ_RELEASE(kv); } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ if (opal_pmix.get_attr(PMIX_LOCAL_SIZE, &kv)) { orte_process_info.num_local_peers = kv->data.uint32 - 1; // want number besides ourselves OBJ_RELEASE(kv); } else { orte_process_info.num_local_peers = 0; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } #if OPAL_HAVE_HWLOC /* if it wasn't passed down to us, get the topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "topology discovery"; goto error; } } #endif /* we don't need to force the routed system to pick the * "direct" component as that should happen automatically * in those cases where we are direct launched (i.e., no * HNP is defined in the environment) */ /* now that we
have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /*** PUSH DATA FOR OTHERS TO FIND ***/ /* if we are direct launched, then push our RML URI - there * is no need to do so when launched by mpirun as all apps * communicate thru their local daemon */ if (orte_standalone_operation) { OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_URI, &vals)) { /* construct the RTE string */ rmluri = orte_rml.get_contact_info(); /* push it out for others to use */ OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_URI); kvn.type = OPAL_STRING; kvn.data.string = strdup(rmluri); if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store uri"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); free(rmluri); } OPAL_LIST_DESTRUCT(&vals); } /* push our hostname so others can find us, if they need to */ OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_HOSTNAME); kvn.type = OPAL_STRING; kvn.data.string = strdup(orte_process_info.nodename); if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store hostname"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); /* if our local rank was not provided by the system, then * push our local rank so others can access it */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_LOCALRANK, &vals)) { OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_LOCALRANK); kvn.type = OPAL_UINT16; kvn.data.uint16 = orte_process_info.my_local_rank; if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store local rank"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); } OPAL_LIST_DESTRUCT(&vals); /* if our node rank was not provided by the system, then * push our node rank so others can access it */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_NODERANK, &vals)) { OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_NODERANK); kvn.type = OPAL_UINT16; kvn.data.uint16 = orte_process_info.my_node_rank; if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store node rank"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); } OPAL_LIST_DESTRUCT(&vals); /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. 
* Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
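/*
 * Both rte_init() variants above putenv() an asprintf'd string and note that
 * the envar "cannot be freed". That is inherent to POSIX putenv(): the string
 * itself becomes part of environ, so freeing it would leave the environment
 * pointing at freed memory. A minimal standalone illustration (asprintf is a
 * GNU/BSD extension, hence the feature macro):
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

static int push_env_int(const char *name, int value) {
    char *envar = NULL;
    if (0 > asprintf(&envar, "%s=%d", name, value)) {
        return -1;
    }
    putenv(envar);   /* environ now references envar directly... */
    /* ...so envar must NOT be freed while the variable remains in use */
    return 0;
}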
int main(int argc, char *argv[]) { int32_t ret, i; opal_cmd_line_t cmd_line; char **inpt; opal_buffer_t *buf; int count; char cwd[OPAL_PATH_MAX]; orcm_tool_cmd_t flag = ORCM_TOOL_STOP_CMD; int32_t master=0; uint16_t jfam=0; /*************** * Initialize ***************/ /* * Make sure to init util before parse_args * to ensure installdirs is setup properly * before calling mca_base_open(); */ if( ORTE_SUCCESS != (ret = orcm_init_util()) ) { return ret; } /* initialize the globals */ my_globals.help = false; my_globals.replicas = NULL; my_globals.sched = NULL; my_globals.hnp_uri = NULL; /* Parse the command line options */ opal_cmd_line_create(&cmd_line, cmd_line_opts); mca_base_open(); mca_base_cmd_line_setup(&cmd_line); ret = opal_cmd_line_parse(&cmd_line, true, argc, argv); /* extract the MCA/GMCA params */ mca_base_cmd_line_process_args(&cmd_line, &environ, &environ); /** * Now start parsing our specific arguments */ if (OPAL_SUCCESS != ret || my_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); orte_show_help("help-orcm-stop.txt", "usage", true, args); free(args); return ORTE_ERROR; } if (NULL != my_globals.sched) { if (0 == strncmp(my_globals.sched, "file", strlen("file")) || 0 == strncmp(my_globals.sched, "FILE", strlen("FILE"))) { char input[1024], *filename; FILE *fp; /* it is a file - get the filename */ filename = strchr(my_globals.sched, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "scheduler", my_globals.sched); return ORTE_ERROR; } ++filename; /* space past the : */ if (0 >= strlen(filename)) { /* they forgot to give us the name! */ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "scheduler", my_globals.sched); return ORTE_ERROR; } /* open the file and extract the pid */ fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-access", true, "scheduler", filename); return ORTE_ERROR; } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-openrcm-runtime.txt", "hnp-file-bad", true, "scheduler", filename); return ORTE_ERROR; } fclose(fp); input[strlen(input)-1] = '\0'; /* remove newline */ /* convert the pid */ master = strtoul(input, NULL, 10); } else { /* should just be the master itself */ master = strtoul(my_globals.sched, NULL, 10); } } /* if we were given HNP contact info, parse it and * setup the process_info struct with that info */ if (NULL != my_globals.hnp_uri) { if (0 == strncmp(my_globals.hnp_uri, "file", strlen("file")) || 0 == strncmp(my_globals.hnp_uri, "FILE", strlen("FILE"))) { char input[1024], *filename; FILE *fp; /* it is a file - get the filename */ filename = strchr(my_globals.hnp_uri, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "uri", my_globals.hnp_uri); goto cleanup; } ++filename; /* space past the : */ if (0 >= strlen(filename)) { /* they forgot to give us the name! */ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "uri", my_globals.hnp_uri); goto cleanup; } /* open the file and extract the uri */ fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file!
*/ orte_show_help("help-openrcm-runtime.txt", "hnp-filename-access", true, "uri", filename); goto cleanup; } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-openrcm-runtime.txt", "hnp-file-bad", true, "uri", filename); goto cleanup; } fclose(fp); input[strlen(input)-1] = '\0'; /* remove newline */ /* put into the process info struct */ orte_process_info.my_hnp_uri = strdup(input); } else { /* should just be the uri itself */ orte_process_info.my_hnp_uri = strdup(my_globals.hnp_uri); } } if (OPAL_SUCCESS != opal_getcwd(cwd, sizeof(cwd))) { opal_output(orte_clean_output, "failed to get cwd\n"); return ORTE_ERR_NOT_FOUND; } /*************************** * We need all of OPAL and ORTE - this will * automatically connect us to the CM ***************************/ if (ORTE_SUCCESS != orcm_init(ORCM_TOOL)) { orcm_finalize(); return 1; } /* if we were given the hnp uri, extract the job family for the * master id */ if (NULL != my_globals.hnp_uri) { master = ORTE_JOB_FAMILY(ORTE_PROC_MY_HNP->jobid); } /* register to receive responses */ if (ORCM_SUCCESS != (ret = orcm_pnp.register_receive("orcm-stop", "0.1", "alpha", ORCM_PNP_GROUP_INPUT_CHANNEL, ORCM_PNP_TAG_TOOL, ack_recv, NULL))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* announce my existence */ if (ORCM_SUCCESS != (ret = orcm_pnp.announce("orcm-stop", "0.1", "alpha", NULL))) { ORTE_ERROR_LOG(ret); goto cleanup; } /* setup the buffer to send our cmd */ buf = OBJ_NEW(opal_buffer_t); /* indicate the scheduler to be used */ jfam = master & 0x0000ffff; opal_dss.pack(buf, &jfam, 1, OPAL_UINT16); /* get the apps to stop */ inpt = NULL; opal_cmd_line_get_tail(&cmd_line, &count, &inpt); if (0 == count) { /* if no apps were given, then we stop the entire * DVM itself by telling the daemons to terminate */ if (ORCM_SUCCESS != (ret = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_TERMINATE, NULL, 0, buf, cbfunc, NULL))) { ORTE_ERROR_LOG(ret); } goto cleanup; } else { /* load the stop cmd */ opal_dss.pack(buf, &flag, 1, ORCM_TOOL_CMD_T); /* for each app */ for (i=0; NULL != inpt[i]; i++) { opal_dss.pack(buf, &inpt[i], 1, OPAL_STRING); /* pack the replicas to be stopped */ opal_dss.pack(buf, &my_globals.replicas, 1, OPAL_STRING); } opal_argv_free(inpt); if (ORCM_SUCCESS != (ret = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL, NULL, ORCM_PNP_TAG_TOOL, NULL, 0, buf, cbfunc, NULL))) { ORTE_ERROR_LOG(ret); } } /* now wait for ack */ opal_event_dispatch(opal_event_base); /*************** * Cleanup ***************/ cleanup: orcm_finalize(); return ret; }
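/*
 * The "file:<path>" handling duplicated twice in main() above follows one
 * convention: if an option value starts with "file" (or "FILE"), everything
 * past the ':' names a file whose first line holds the real value. A compact
 * standalone sketch of that convention (names illustrative; the strcspn()
 * newline strip also stays safe on an empty line, unlike indexing with
 * strlen()-1):
 */
#include <stdio.h>
#include <string.h>

/* returns 0 and fills 'value' from the file's first line, -1 on any error */
static int read_file_directive(const char *optval, char *value, size_t len) {
    const char *filename = strchr(optval, ':');
    if (NULL == filename || '\0' == *(++filename)) {
        return -1;                        /* "file:" with no path given */
    }
    FILE *fp = fopen(filename, "r");
    if (NULL == fp) {
        return -1;                        /* can't find or read the file */
    }
    if (NULL == fgets(value, (int)len, fp)) {
        fclose(fp);
        return -1;                        /* empty or malformed file */
    }
    fclose(fp);
    value[strcspn(value, "\n")] = '\0';   /* strip the trailing newline */
    return 0;
}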
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat) { int rc; /* if I am a tool, then I stand alone - there is nothing to do */ if (ORTE_PROC_IS_TOOL) { return ORTE_SUCCESS; } /* if I am a daemon or HNP, then I have to extract the routing info for this job * from the data sent to me for launch and update the routing tables to * point at the daemon for each proc */ if (ORTE_PROC_IS_DAEMON) { OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s direct: init routes for daemon job %s\n\thnp_uri %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri)); if (NULL == ndat) { /* indicates this is being called during orte_init. * Get the HNP's name for possible later use */ if (NULL == orte_process_info.my_hnp_uri) { /* fatal error */ ORTE_ERROR_LOG(ORTE_ERR_FATAL); return ORTE_ERR_FATAL; } /* extract the hnp name and store it */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* set the contact info into the hash table */ orte_rml.set_contact_info(orte_process_info.my_hnp_uri); /* the HNP is my lifeline */ lifeline = ORTE_PROC_MY_HNP; /* daemons will send their contact info back to the HNP as * part of the message confirming they are ready to go. HNPs * load their contact info during orte_init */ } else { /* ndat != NULL means we are getting an update of RML info * for the daemons - so update our contact info and routes */ if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { ORTE_ERROR_LOG(rc); } return rc; } OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output, "%s routed_direct: completed init routes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; } if (ORTE_PROC_IS_HNP) { OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_direct: init routes for HNP job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job))); if (NULL != ndat) { /* if this is for my own jobid, then I am getting an update of RML info * for the daemons - so update our contact info and routes */ if (ORTE_PROC_MY_NAME->jobid == job) { if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { ORTE_ERROR_LOG(rc); return rc; } } } return ORTE_SUCCESS; } /*** MUST BE A PROC ***/ if (NULL == ndat) { /* if we were direct launched, there is nothing we need to do. If we * were launched by mpirun, then we need to set the HNP and daemon info */ if (NULL != orte_process_info.my_hnp_uri) { /* extract the hnp name and store it */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* we don't set the HNP's contact info as we don't need it - we * only contact our local daemon, which might be the HNP (in which * case it will have also been passed as our daemon uri) */ } if (NULL != orte_process_info.my_daemon_uri) { /* extract the daemon's name so we can update the routing table */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, ORTE_PROC_MY_DAEMON, NULL))) { ORTE_ERROR_LOG(rc); return rc; } orte_rml.set_contact_info(orte_process_info.my_daemon_uri); /* my daemon is my lifeline */ lifeline = ORTE_PROC_MY_DAEMON; } return ORTE_SUCCESS; } /* if ndat != NULL, then this is being invoked by the proc to * init a route to a specified process that is outside of our * job family.
We want that route to go through our HNP, routed via * our local daemon - however, we cannot know for * certain that the HNP already knows how to talk to the specified * procs. For example, in OMPI's publish/subscribe procedures, the * DPM framework looks for an mca param containing the global ompi-server's * uri. This info will come here so the proc can setup a route to * the server - we need to pass the routing info to our HNP. * * Obviously, if we were direct launched, we won't have an HNP, in * which case we just update our own contact info and go direct */ if (NULL == orte_process_info.my_hnp_uri) { OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_direct: init routes w/non-NULL data and direct launched", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) { ORTE_ERROR_LOG(rc); return rc; } } else { opal_buffer_t *xfer; orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD; bool ack_waiting; OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_direct: init routes w/non-NULL data", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) { /* if this is for a different job family, then we route via our HNP * to minimize connection counts to entities such as ompi-server, so * start by sending the contact info to the HNP for update */ OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_direct_init_routes: diff job family - sending update to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_HNP))); /* prep the buffer for transmission to the HNP */ xfer = OBJ_NEW(opal_buffer_t); opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD); opal_dss.copy_payload(xfer, ndat); /* save any new connections for use in subsequent connect_accept calls */ orte_routed_base_update_hnps(ndat); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer, ORTE_RML_TAG_RML_INFO_UPDATE, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return rc; } /* wait right here until the HNP acks the update to ensure that * any subsequent messaging can succeed */ ack_waiting = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_UPDATE_ROUTE_ACK, ORTE_RML_NON_PERSISTENT, recv_ack, &ack_waiting); ORTE_WAIT_FOR_COMPLETION(ack_waiting); OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output, "%s routed_direct_init_routes: ack recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* our get_route function automatically routes all messages for * other job families via the HNP, so nothing more to do here */ } } return ORTE_SUCCESS; }
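/*
 * The recv_ack callback posted by both init_routes() variants is not shown in
 * this excerpt; given how it is registered (non-persistent, with &ack_waiting
 * as cbdata) it presumably just clears the flag that ORTE_WAIT_FOR_COMPLETION
 * spins on, along these lines (signature per the orte_rml buffer-recv
 * callback convention):
 */
static void recv_ack(int status, orte_process_name_t *sender,
                     opal_buffer_t *buffer, orte_rml_tag_t tag,
                     void *cbdata)
{
    bool *ack_waiting = (bool*)cbdata;
    *ack_waiting = false;   /* releases the wait loop in init_routes */
}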