int orte_daemon(int argc, char *argv[]) { int ret = 0; opal_cmd_line_t *cmd_line = NULL; char *rml_uri; int i; opal_buffer_t *buffer; char hostname[100]; #if OPAL_ENABLE_FT_CR == 1 char *tmp_env_var = NULL; #endif /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals)); /* initialize the singleton died pipe to an illegal value so we can detect it was set */ orted_globals.singleton_died_pipe = -1; /* init the failure orted vpid to an invalid value */ orted_globals.fail = ORTE_VPID_INVALID; /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) { OBJ_RELEASE(cmd_line); exit(1); } mca_base_cmd_line_setup(cmd_line); if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, argc, argv))) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args); free(args); OBJ_RELEASE(cmd_line); return ret; } /* * Since this process can now handle MCA/GMCA parameters, make sure to * process them. */ mca_base_cmd_line_process_args(cmd_line, &environ, &environ); /* Ensure that enough of OPAL is setup for us to be able to run */ /* * NOTE: (JJH) * We need to allow 'mca_base_cmd_line_process_args()' to process command * line arguments *before* calling opal_init_util() since the command * line could contain MCA parameters that affect the way opal_init_util() * functions. AMCA parameters are one such option normally received on the * command line that affect the way opal_init_util() behaves. * It is "safe" to call mca_base_cmd_line_process_args() before * opal_init_util() since mca_base_cmd_line_process_args() does *not* * depend upon opal_init_util() functionality. */ if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) { fprintf(stderr, "OPAL failed to initialize -- orted aborting\n"); exit(1); } /* save the environment for launch purposes. 
This MUST be * done so that we can pass it to any local procs we * spawn - otherwise, those local procs won't see any * non-MCA envars that were set in the enviro when the * orted was executed - e.g., by .csh */ orte_launch_environ = opal_argv_copy(environ); /* purge any ess flag set in the environ when we were launched */ opal_unsetenv("OMPI_MCA_ess", &orte_launch_environ); /* if orte_daemon_debug is set, let someone know we are alive right * away just in case we have a problem along the way */ if (orted_globals.debug) { gethostname(hostname, 100); fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname); } /* check for help request */ if (orted_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); orte_show_help("help-orted.txt", "orted:usage", false, argv[0], args); free(args); return 1; } #if defined(HAVE_SETSID) /* see if we were directed to separate from current session */ if (orted_globals.set_sid) { setsid(); } #endif /* see if they want us to spin until they can connect a debugger to us */ i=0; while (orted_spin_flag) { i++; if (1000 < i) i=0; } #if OPAL_ENABLE_FT_CR == 1 /* Mark as a tool program */ (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var); opal_setenv(tmp_env_var, "1", true, &environ); free(tmp_env_var); #endif /* if mapreduce set, flag it */ if (orted_globals.mapreduce) { orte_map_reduce = true; } /* detach from controlling terminal * otherwise, remain attached so output can get to us */ if(!orte_debug_flag && !orte_debug_daemons_flag && orted_globals.daemonize) { opal_daemon_init(NULL); } /* Set the flag telling OpenRTE that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would * require. */ if (orted_globals.hnp) { if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) { ORTE_ERROR_LOG(ret); return ret; } } else { if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) { ORTE_ERROR_LOG(ret); return ret; } } /* finalize the OPAL utils. As they are opened again from orte_init->opal_init * we continue to have a reference count on them. So we have to finalize them twice... */ opal_finalize_util(); if ((int)ORTE_VPID_INVALID != orted_globals.fail) { orted_globals.abort=false; /* some vpid was ordered to fail. The value can be positive * or negative, depending upon the desired method for failure, * so need to check both here */ if (0 > orted_globals.fail) { orted_globals.fail = -1*orted_globals.fail; orted_globals.abort = true; } /* are we the specified vpid? */ if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) { /* if the user specified we delay, then setup a timer * and have it kill us */ if (0 < orted_globals.fail_delay) { ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI); } else { opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orted_globals.abort ? "abort" : "abnormal termination"); /* do -not- call finalize as this will send a message to the HNP * indicating clean termination! 
Instead, just forcibly cleanup * the local session_dir tree and exit */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); /* if we were ordered to abort, do so */ if (orted_globals.abort) { abort(); } /* otherwise, return with non-zero status */ ret = ORTE_ERROR_DEFAULT_EXIT_CODE; goto DONE; } } } /* insert our contact info into our process_info struct so we * have it for later use and set the local daemon field to our name */ orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; /* if I am also the hnp, then update that contact info field too */ if (ORTE_PROC_IS_HNP) { orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; } /* setup the primary daemon command receive function */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); /* output a message indicating we are alive, our name, and our pid * for debugging purposes */ if (orte_debug_daemons_flag) { fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid, orte_process_info.nodename); } /* We actually do *not* want the orted to voluntarily yield() the processor more than necessary. The orted already blocks when it is doing nothing, so it doesn't use any more CPU cycles than it should; but when it *is* doing something, we do not want it to be unnecessarily delayed because it voluntarily yielded the processor in the middle of its work. For example: when a message arrives at the orted, we want the OS to wake up the orted in a timely fashion (which most OS's seem good about doing) and then we want the orted to process the message as fast as possible. If the orted yields and lets aggressive MPI applications get the processor back, it may be a long time before the OS schedules the orted to run again (particularly if there is no IO event to wake it up). Hence, routed OOB messages (for example) may be significantly delayed before being delivered to MPI processes, which can be problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); /* Change the default behavior of libevent such that we want to continually block rather than blocking for the default timeout and then looping around the progress engine again. There should be nothing in the orted that cannot block in libevent until "something" happens (i.e., there's no need to keep cycling through progress because the only things that should happen will happen in libevent). This is a minor optimization, but what the heck... 
:-) */ opal_progress_set_event_flag(OPAL_EVLOOP_ONCE); /* if requested, report my uri to the indicated pipe */ if (orted_globals.uri_pipe > 0) { orte_job_t *jdata; orte_proc_t *proc; orte_node_t *node; orte_app_context_t *app; char *tmp, *nptr, *sysinfo; int32_t ljob; orte_grpcomm_collective_t *coll; orte_namelist_t *nm; /* setup the singleton's job */ jdata = OBJ_NEW(orte_job_t); orte_plm_base_create_jobid(jdata); ljob = ORTE_LOCAL_JOBID(jdata->jobid); opal_pointer_array_set_item(orte_job_data, ljob, jdata); /* must create a map for it (even though it has no * info in it) so that the job info will be picked * up in subsequent pidmaps or other daemons won't * know how to route */ jdata->map = OBJ_NEW(orte_job_map_t); /* setup an app_context for the singleton */ app = OBJ_NEW(orte_app_context_t); app->app = strdup("singleton"); app->num_procs = 1; opal_pointer_array_add(jdata->apps, app); /* setup a proc object for the singleton - since we * -must- be the HNP, and therefore we stored our * node on the global node pool, and since the singleton * -must- be on the same node as us, indicate that */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = jdata->jobid; proc->name.vpid = 0; proc->alive = true; proc->state = ORTE_PROC_STATE_RUNNING; proc->app_idx = 0; /* obviously, it is on my node */ node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); proc->node = node; OBJ_RETAIN(node); /* keep accounting straight */ opal_pointer_array_add(jdata->procs, proc); jdata->num_procs = 1; /* and it obviously is on the node */ OBJ_RETAIN(proc); opal_pointer_array_add(node->procs, proc); node->num_procs++; /* and obviously it is one of my local procs */ OBJ_RETAIN(proc); opal_pointer_array_add(orte_local_children, proc); jdata->num_local_procs = 1; /* set the trivial */ proc->local_rank = 0; proc->node_rank = 0; proc->app_rank = 0; proc->state = ORTE_PROC_STATE_RUNNING; proc->alive = true; proc->app_idx = 0; proc->local_proc = true; /* account for the collectives in its modex/barriers */ jdata->peer_modex = orte_grpcomm_base_get_coll_id(); coll = orte_grpcomm_base_setup_collective(jdata->peer_modex); nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = jdata->jobid; nm->name.vpid = ORTE_VPID_WILDCARD; opal_list_append(&coll->participants, &nm->super); jdata->peer_init_barrier = orte_grpcomm_base_get_coll_id(); coll = orte_grpcomm_base_setup_collective(jdata->peer_init_barrier); nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = jdata->jobid; nm->name.vpid = ORTE_VPID_WILDCARD; opal_list_append(&coll->participants, &nm->super); jdata->peer_fini_barrier = orte_grpcomm_base_get_coll_id(); coll = orte_grpcomm_base_setup_collective(jdata->peer_fini_barrier); nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = jdata->jobid; nm->name.vpid = ORTE_VPID_WILDCARD; opal_list_append(&coll->participants, &nm->super); /* need to setup a pidmap for it */ if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) { ORTE_ERROR_LOG(ret); goto DONE; } /* if we don't yet have a daemon map, then we have to generate one * to pass back to it */ if (NULL != orte_nidmap.bytes) { free(orte_nidmap.bytes); } if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(&orte_nidmap, false))) { ORTE_ERROR_LOG(ret); goto DONE; } /* create a string that contains our uri + sysinfo */ orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model); asprintf(&tmp, "%s[%s]", orte_process_info.my_daemon_uri, sysinfo); free(sysinfo); /* pass that info to the singleton */ write(orted_globals.uri_pipe, tmp, 
strlen(tmp)+1); /* need to add 1 to get the NULL */ /* cleanup */ free(tmp); /* since a singleton spawned us, we need to harvest * any MCA params from the local environment so * we can pass them along to any subsequent daemons * we may start as the result of a comm_spawn */ for (i=0; NULL != environ[i]; i++) { if (0 == strncmp(environ[i], "OMPI_MCA", 8)) { /* make a copy to manipulate */ tmp = strdup(environ[i]); /* find the equal sign */ nptr = strchr(tmp, '='); *nptr = '\0'; nptr++; /* add the mca param to the orted cmd line */ opal_argv_append_nosize(&orted_cmd_line, "-mca"); opal_argv_append_nosize(&orted_cmd_line, &tmp[9]); opal_argv_append_nosize(&orted_cmd_line, nptr); free(tmp); } } } /* if we were given a pipe to monitor for singleton termination, set that up */ if (orted_globals.singleton_died_pipe > 0) { /* register shutdown handler */ pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t)); opal_event_set(orte_event_base, pipe_handler, orted_globals.singleton_died_pipe, OPAL_EV_READ, pipe_closed, pipe_handler); opal_event_add(pipe_handler, NULL); } /* If I have a parent, then save his contact info so * any messages we send can flow thru him. */ orte_parent_uri = NULL; (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri", "URI for the parent if tree launch is enabled.", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_CONSTANT, &orte_parent_uri); if (NULL != orte_parent_uri) { orte_process_name_t parent; /* set the contact info into the hash table */ orte_rml.set_contact_info(orte_parent_uri); ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL); if (ORTE_SUCCESS != ret) { ORTE_ERROR_LOG(ret); free (orte_parent_uri); orte_parent_uri = NULL; goto DONE; } /* don't need this value anymore */ free(orte_parent_uri); orte_parent_uri = NULL; /* tell the routed module that we have a path * back to the HNP */ if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) { ORTE_ERROR_LOG(ret); goto DONE; } /* set the lifeline to point to our parent so that we * can handle the situation if that lifeline goes away */ if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) { ORTE_ERROR_LOG(ret); goto DONE; } } /* if we are not the HNP...the only time we will be an HNP * is if we are launched by a singleton to provide support * for it */ if (!ORTE_PROC_IS_HNP) { /* send the information to the orted report-back point - this function * will process the data, but also counts the number of * orteds that reported back so the launch procedure can continue. * We need to do this at the last possible second as the HNP * can turn right around and begin issuing orders to us */ buffer = OBJ_NEW(opal_buffer_t); /* insert our name for rollup purposes */ if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } /* for now, always include our contact info, even if we are using * static ports. 
Eventually, this will be removed */ rml_uri = orte_rml.get_contact_info(); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } /* include our node name */ opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING); /* if requested, include any non-loopback aliases for this node */ if (orte_retain_aliases) { char **aliases=NULL; uint8_t naliases, ni; char hostname[ORTE_MAX_HOSTNAME_SIZE]; /* if we stripped the prefix or removed the fqdn, * include full hostname as an alias */ gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE); if (strlen(orte_process_info.nodename) < strlen(hostname)) { opal_argv_append_nosize(&aliases, hostname); } opal_ifgetaliases(&aliases); naliases = opal_argv_count(aliases); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } for (ni=0; ni < naliases; ni++) { if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } opal_argv_free(aliases); } #if OPAL_HAVE_HWLOC { char *coprocessors; /* add the local topology */ if (NULL != opal_hwloc_topology && (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) { if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) { ORTE_ERROR_LOG(ret); } } /* detect and add any coprocessors */ coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); } /* see if I am on a coprocessor */ coprocessors = opal_hwloc_base_check_on_coprocessor(); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); } } #endif /* send to the HNP's callback - will be routed if routes are available */ if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_ORTED_CALLBACK, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } /* if we are tree-spawning, then we need to capture the MCA params * from our cmd line so we can pass them along to the daemons we spawn - * otherwise, only the first layer of daemons will ever see them */ if (orted_globals.tree_spawn) { int j, k; bool ignore; char *no_keep[] = { "orte_hnp_uri", "orte_ess_jobid", "orte_ess_vpid", "orte_ess_num_procs", "orte_parent_uri", NULL }; for (i=0; i < argc; i++) { if (0 == strcmp("-mca", argv[i]) || 0 == strcmp("--mca", argv[i]) ) { ignore = false; /* see if this is something we cannot pass along */ for (k=0; NULL != no_keep[k]; k++) { if (0 == strcmp(no_keep[k], argv[i+1])) { ignore = true; break; } } if (!ignore) { /* see if this is already present so we at least can * avoid growing the cmd line with duplicates */ if (NULL != orted_cmd_line) { for (j=0; NULL != orted_cmd_line[j]; j++) { if (0 == strcmp(argv[i+1], orted_cmd_line[j])) { /* already here - ignore it */ ignore = true; break; } } } if (!ignore) { opal_argv_append_nosize(&orted_cmd_line, argv[i]); opal_argv_append_nosize(&orted_cmd_line, argv[i+1]); opal_argv_append_nosize(&orted_cmd_line, argv[i+2]); } } i += 2; } } } if (orte_debug_daemons_flag) { opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } /* loop the event lib until an exit event is detected */ while (orte_event_base_active) { opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); } /* ensure all local procs are dead */ 
orte_odls.kill_local_procs(NULL); DONE: /* update the exit status, in case it wasn't done */ ORTE_UPDATE_EXIT_STATUS(orte_exit_status); /* cleanup and leave */ orte_finalize(); if (orte_debug_flag) { fprintf(stderr, "exiting with status %d\n", orte_exit_status); } exit(orte_exit_status); }
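/*
 * Illustration (not part of ORTE): the singleton side of the uri_pipe
 * handshake above.  orte_daemon() writes the "uri[sysinfo]" string to
 * orted_globals.uri_pipe with strlen(tmp)+1 bytes, i.e. including the
 * NUL terminator.  A reader on the other end of the pipe could collect
 * it as sketched below.  This is a hedged, standalone sketch - the
 * buffer handling is an assumption, not ORTE's actual singleton code.
 */
#include <stdlib.h>
#include <unistd.h>

static char *read_uri_from_pipe(int fd)
{
    size_t len = 0, cap = 256;
    char *buf = malloc(cap);
    if (NULL == buf) {
        return NULL;
    }
    for (;;) {
        char c;
        ssize_t n = read(fd, &c, 1);
        if (n <= 0) {                   /* EOF or error before the NUL */
            free(buf);
            return NULL;
        }
        if (len + 1 > cap) {
            char *tmp = realloc(buf, cap *= 2);
            if (NULL == tmp) {
                free(buf);
                return NULL;
            }
            buf = tmp;
        }
        buf[len++] = c;
        if ('\0' == c) {                /* writer sent the terminator too */
            return buf;
        }
    }
}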
static void _send_notification(int status, orte_proc_state_t state, orte_process_name_t *proc, orte_process_name_t *target) { opal_buffer_t *buf; orte_grpcomm_signature_t sig; int rc; opal_value_t kv, *kvptr; orte_process_name_t daemon; buf = OBJ_NEW(opal_buffer_t); opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:hnp:sending notification %s proc %s target %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status), ORTE_NAME_PRINT(proc), ORTE_NAME_PRINT(target)); /* pack the status */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } /* the source is the proc */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } if (OPAL_ERR_PROC_ABORTED == status) { /* we will pass three opal_value_t's */ rc = 3; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } /* pass along the affected proc(s) */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC); kv.type = OPAL_NAME; kv.data.name.jobid = proc->jobid; kv.data.name.vpid = proc->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); } else { /* we are going to pass two opal_value_t's */ rc = 2; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } } /* pass along the affected proc(s) */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC); kv.type = OPAL_NAME; kv.data.name.jobid = proc->jobid; kv.data.name.vpid = proc->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); /* pass along the proc(s) to be notified */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE); kv.type = OPAL_NAME; kv.data.name.jobid = target->jobid; kv.data.name.vpid = target->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); /* if the targets are a wildcard, then xcast it to everyone */ if (ORTE_VPID_WILDCARD == target->vpid) { OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t); sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid; sig.signature[0].vpid = ORTE_VPID_WILDCARD; sig.sz = 1; if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) { ORTE_ERROR_LOG(rc); } OBJ_DESTRUCT(&sig); OBJ_RELEASE(buf); } else { /* get the daemon hosting the proc to be notified */ daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.vpid = orte_get_proc_daemon_vpid(target); /* send the notification to that daemon */ opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:sending notification %s to proc %s at daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(&daemon)); if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(&daemon, buf, ORTE_RML_TAG_NOTIFICATION, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); } } }
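/*
 * Illustration (not part of ORTE): _send_notification() prefixes its
 * opal_value_t payload with an integer count (3 for an aborted proc,
 * 2 otherwise) so the receiver knows how many values to unpack.  The
 * same count-prefixed framing, reduced to memcpy on a raw byte buffer,
 * is sketched below; this is an assumption-laden stand-in for the real
 * opal_dss.pack/unpack machinery, not a piece of it.
 */
#include <stdint.h>
#include <string.h>

/* write count, then count fixed-size records; returns bytes used.
 * a receiver reads the leading count, then loops exactly that many times */
static size_t pack_counted(uint8_t *dst, const int32_t *vals, int32_t count)
{
    size_t off = 0;
    memcpy(dst + off, &count, sizeof(count));            /* count first */
    off += sizeof(count);
    memcpy(dst + off, vals, (size_t)count * sizeof(*vals));
    return off + (size_t)count * sizeof(*vals);
}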
static int mca_oob_ud_process_messages (struct ibv_cq *event_cq, mca_oob_ud_port_t *port) { mca_oob_ud_msg_item_t *msg_item, *next_item; opal_list_t *processing_msgs = &mca_oob_ud_component.ud_event_processing_msgs; mca_oob_ud_peer_t *peer; mca_oob_ud_msg_hdr_t *msg_hdr; int msg_num, i, count; struct ibv_wc wc[40]; bool peer_nacked; count = ibv_poll_cq (event_cq, 40, wc); if (count < 0) return count; /* acknowledge the events */ ibv_ack_cq_events (event_cq, count); for (i = 0 ; i < count ; ++i) { msg_num = (int)(wc[i].wr_id & (~MCA_OOB_UD_RECV_WR)); msg_hdr = (mca_oob_ud_msg_hdr_t *) (port->msg_buf.ptr + msg_num * port->mtu); VALGRIND_MAKE_MEM_DEFINED(msg_hdr, wc[i].byte_len); if (!(wc[i].wr_id & MCA_OOB_UD_RECV_WR) || IBV_WC_SUCCESS != wc[i].status) { mca_oob_ud_port_post_one_recv (port, msg_num); continue; } peer = mca_oob_ud_get_peer (port, &msg_hdr->ra.name, wc[i].src_qp, msg_hdr->ra.qkey, wc[i].slid, msg_hdr->ra.port_num); if (peer) { if (MCA_OOB_UD_MSG_ACK != msg_hdr->msg_type && MCA_OOB_UD_MSG_NACK != msg_hdr->msg_type && MCA_OOB_UD_MSG_END != msg_hdr->msg_type) { mca_oob_ud_msg_item_t *msg_item = OBJ_NEW(mca_oob_ud_msg_item_t); msg_item->msg_num = msg_num; msg_item->hdr = msg_hdr; msg_item->port = port; msg_item->peer = peer; opal_list_append (processing_msgs, (opal_list_item_t *) msg_item); } else { if (MCA_OOB_UD_MSG_ACK == msg_hdr->msg_type) { (void) mca_oob_ud_event_handle_ack (port, peer, msg_hdr); } else if (MCA_OOB_UD_MSG_NACK == msg_hdr->msg_type) { (void) mca_oob_ud_event_handle_nack (port, peer, msg_hdr); } else { mca_oob_ud_event_handle_end (peer, msg_hdr); } mca_oob_ud_port_post_one_recv (port, msg_num); } } else { OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:process_message got a null peer for message id %" PRIu64, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), msg_hdr->msg_id)); mca_oob_ud_port_post_one_recv (port, msg_num); } } /* Sort messages by peer then id */ opal_list_sort (processing_msgs, mca_oob_ud_msg_item_cmp); /* Send ACKs/NACKs and throw away out-of-order messages */ msg_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_first (processing_msgs); for (peer = NULL, peer_nacked = false ; NULL != msg_item ; msg_item = next_item) { if (peer != msg_item->peer) { peer_nacked = false; } peer = msg_item->peer; next_item = (mca_oob_ud_msg_item_t *) mca_oob_ud_list_get_next (processing_msgs, (opal_list_item_t *)msg_item); if (false == peer_nacked) { if (msg_item->hdr->msg_id > peer->peer_expected_id) { (void) mca_oob_ud_event_send_nack (msg_item->port, peer, msg_item->hdr); peer_nacked = true; } else if (NULL == next_item || (next_item->peer != msg_item->peer)) { (void) mca_oob_ud_event_send_ack (msg_item->port, msg_item->peer, msg_item->hdr); } } if (msg_item->hdr->msg_id != peer->peer_expected_id) { opal_list_remove_item (processing_msgs, (opal_list_item_t *) msg_item); OBJ_RELEASE(msg_item); } else { peer->peer_expected_id++; } } /* Process remaining messages */ while (NULL != (msg_item = (mca_oob_ud_msg_item_t *) opal_list_remove_first (processing_msgs))) { switch (msg_item->hdr->msg_type) { case MCA_OOB_UD_MSG_REQUEST: mca_oob_ud_event_handle_req (port, msg_item->peer, msg_item->hdr); break; case MCA_OOB_UD_MSG_REPLY: mca_oob_ud_event_handle_rep (port, msg_item->hdr); break; case MCA_OOB_UD_MSG_COMPLETE: mca_oob_ud_event_handle_completion (port, msg_item->hdr); break; case MCA_OOB_UD_MSG_DATA_OK: mca_oob_ud_event_handle_data_ok (port, msg_item->hdr); break; case MCA_OOB_UD_MSG_END: mca_oob_ud_event_handle_end (peer, msg_item->hdr); break; default: /* do nothing */ break; } OBJ_RELEASE(msg_item); } return count; }
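/*
 * Illustration (not part of the component): opal_list_sort() above relies
 * on mca_oob_ud_msg_item_cmp ordering items by peer first and message id
 * second, so the ACK/NACK pass can walk each peer's messages in sequence
 * order.  A comparator of that shape is sketched below; the struct and
 * field names are invented for the example, not the component's types.
 */
#include <stdint.h>

struct msg_key {
    uintptr_t peer;     /* stand-in for the peer pointer */
    uint64_t  msg_id;
};

static int msg_key_cmp(const struct msg_key *a, const struct msg_key *b)
{
    if (a->peer != b->peer) {
        return (a->peer < b->peer) ? -1 : 1;     /* group by peer */
    }
    if (a->msg_id != b->msg_id) {
        return (a->msg_id < b->msg_id) ? -1 : 1; /* then by id */
    }
    return 0;
}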
void orte_iof_hnp_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { orte_process_name_t origin; unsigned char data[ORTE_IOF_BASE_MSG_MAX]; orte_iof_tag_t stream; int32_t count, numbytes; orte_iof_sink_t *sink; opal_list_item_t *item, *next; int rc; /* unpack the stream first as this may be flow control info */ count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &stream, &count, ORTE_IOF_TAG))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } if (ORTE_IOF_XON & stream) { /* re-start the stdin read event */ if (NULL != mca_iof_hnp_component.stdinev && !orte_job_term_ordered && !mca_iof_hnp_component.stdinev->active) { mca_iof_hnp_component.stdinev->active = true; opal_event_add(mca_iof_hnp_component.stdinev->ev, 0); } goto CLEAN_RETURN; } else if (ORTE_IOF_XOFF & stream) { /* stop the stdin read event - only meaningful if it is currently active */ if (NULL != mca_iof_hnp_component.stdinev && mca_iof_hnp_component.stdinev->active) { opal_event_del(mca_iof_hnp_component.stdinev->ev); mca_iof_hnp_component.stdinev->active = false; } goto CLEAN_RETURN; } /* get name of the process whose io we are discussing */ count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &origin, &count, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* check to see if a tool has requested something */ if (ORTE_IOF_PULL & stream) { OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received pull cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we send it a copy of the specified stream(s) * from the specified process(es), so create a sink for it */ if (ORTE_IOF_STDOUT & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDOUT, NULL, &mca_iof_hnp_component.sinks); sink->daemon.jobid = sender->jobid; sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDERR & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDERR, NULL, &mca_iof_hnp_component.sinks); sink->daemon.jobid = sender->jobid; sink->daemon.vpid = sender->vpid; } if (ORTE_IOF_STDDIAG & stream) { ORTE_IOF_SINK_DEFINE(&sink, &origin, -1, ORTE_IOF_STDDIAG, NULL, &mca_iof_hnp_component.sinks); sink->daemon.jobid = sender->jobid; sink->daemon.vpid = sender->vpid; } goto CLEAN_RETURN; } if (ORTE_IOF_CLOSE & stream) { OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s received close cmd from remote tool %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(sender), ORTE_NAME_PRINT(&origin))); /* a tool is requesting that we no longer forward a copy of the * specified stream(s) from the specified process(es) - remove the sink */ item = opal_list_get_first(&mca_iof_hnp_component.sinks); while (item != opal_list_get_end(&mca_iof_hnp_component.sinks)) { next = opal_list_get_next(item); sink = (orte_iof_sink_t*)item; /* if the target isn't set, then this sink is for another purpose - ignore it, * but advance to the next item or we would spin on this one forever */ if (ORTE_JOBID_INVALID == sink->daemon.jobid) { item = next; continue; } /* if this sink is the designated one, then remove it from list */ if ((stream & sink->tag) && sink->name.jobid == origin.jobid && (ORTE_VPID_WILDCARD == sink->name.vpid || ORTE_VPID_WILDCARD == origin.vpid || sink->name.vpid == origin.vpid)) { /* send an ack message to the requestor - this ensures that the RML has * completed sending anything to that requestor before it exits */ orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &origin, ORTE_IOF_CLOSE, NULL, 0); opal_list_remove_item(&mca_iof_hnp_component.sinks, item); OBJ_RELEASE(item); } item = 
next; } goto CLEAN_RETURN; } /* this must have come from a daemon forwarding output - unpack the data */ numbytes=ORTE_IOF_BASE_MSG_MAX; if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, data, &numbytes, OPAL_BYTE))) { ORTE_ERROR_LOG(rc); goto CLEAN_RETURN; } /* numbytes will contain the actual #bytes that were sent */ OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "%s unpacked %d bytes from remote proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes, ORTE_NAME_PRINT(&origin))); /* output this to our local output */ if (ORTE_IOF_STDOUT & stream || orte_xml_output) { orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stdout->wev); } else { orte_iof_base_write_output(&origin, stream, data, numbytes, orte_iof_base.iof_write_stderr->wev); } /* cycle through the endpoints to see if someone else wants a copy */ for (item = opal_list_get_first(&mca_iof_hnp_component.sinks); item != opal_list_get_end(&mca_iof_hnp_component.sinks); item = opal_list_get_next(item)) { sink = (orte_iof_sink_t*)item; /* if the target isn't set, then this sink is for another purpose - ignore it */ if (ORTE_JOBID_INVALID == sink->daemon.jobid) { continue; } if ((stream & sink->tag) && sink->name.jobid == origin.jobid && (ORTE_VPID_WILDCARD == sink->name.vpid || ORTE_VPID_WILDCARD == origin.vpid || sink->name.vpid == origin.vpid)) { /* send the data to the tool */ orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &origin, stream, data, numbytes); } } CLEAN_RETURN: return; }
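/*
 * Illustration (not part of ORTE): the CLOSE handling and the forwarding
 * loop above both match a sink against an origin with the same wildcard
 * rule - jobids must match, and the vpids match if either side is the
 * wildcard.  Factored into a standalone predicate (with a stand-in type
 * and wildcard constant) it reads:
 */
#include <stdbool.h>
#include <stdint.h>

#define EX_VPID_WILDCARD UINT32_MAX   /* stand-in for ORTE_VPID_WILDCARD */

struct ex_name { uint32_t jobid; uint32_t vpid; };

static bool ex_name_matches(const struct ex_name *sink,
                            const struct ex_name *origin)
{
    return sink->jobid == origin->jobid &&
           (EX_VPID_WILDCARD == sink->vpid ||
            EX_VPID_WILDCARD == origin->vpid ||
            sink->vpid == origin->vpid);
}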
void orte_grpcomm_base_store_modex(opal_buffer_t *rbuf, void *cbdata) { int rc, j, cnt; int32_t num_recvd_entries; orte_process_name_t pname; orte_grpcomm_collective_t *modex = (orte_grpcomm_collective_t*)cbdata; OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s STORING MODEX DATA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* unpack the process name */ cnt=1; while (ORTE_SUCCESS == (rc = opal_dss.unpack(rbuf, &pname, &cnt, ORTE_NAME))) { /* unpack the number of entries for this proc */ cnt=1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &num_recvd_entries, &cnt, OPAL_INT32))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s grpcomm:base:update_modex_entries: adding %d entries for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_recvd_entries, ORTE_NAME_PRINT(&pname))); /* * Extract the attribute names and values */ for (j = 0; j < num_recvd_entries; j++) { opal_value_t *kv; cnt = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(rbuf, &kv, &cnt, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); goto cleanup; } OPAL_OUTPUT_VERBOSE((10, orte_grpcomm_base_framework.framework_output, "%s STORING MODEX DATA FROM %s FOR %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname), kv->key)); /* if this is me, dump the data - we already have it in the db */ if (ORTE_PROC_MY_NAME->jobid == pname.jobid && ORTE_PROC_MY_NAME->vpid == pname.vpid) { OBJ_RELEASE(kv); } else { /* store it in the database */ if (ORTE_SUCCESS != (rc = opal_db.store_pointer((opal_identifier_t*)&pname, kv))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* do not release the kv - the db holds that pointer */ } } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex: completed modex entry for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname))); } cleanup: if (NULL == cbdata) { return; } /* cleanup the list, but don't release the * collective object as it was passed into us */ opal_list_remove_item(&orte_grpcomm_base.active_colls, &modex->super); /* notify that the modex is complete */ if (NULL != modex->cbfunc) { OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s CALLING MODEX RELEASE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); modex->cbfunc(NULL, modex->cbdata); } else { OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output, "%s store:peer:modex NO MODEX RELEASE CBFUNC", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } /* flag the collective as complete */ modex->active = false; }
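/*
 * Illustration (not part of ORTE): orte_grpcomm_base_store_modex() loops
 * "while unpack(name) succeeds", treating buffer exhaustion as the normal
 * end of the stream rather than an error.  The same pattern over a plain
 * byte cursor is sketched below; the names are invented for the example.
 */
#include <stddef.h>
#include <string.h>

struct cursor { const unsigned char *p; size_t left; };

/* returns 0 on success, -1 once the buffer is exhausted */
static int take(struct cursor *c, void *dst, size_t n)
{
    if (c->left < n) {
        return -1;                      /* normal loop-exit condition */
    }
    memcpy(dst, c->p, n);
    c->p += n;
    c->left -= n;
    return 0;
}

/* usage: while (0 == take(&c, &rec, sizeof(rec))) { store(&rec); } */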
static int parse_cli(int argc, int start, char **argv) { int i, j, k; bool ignore; char *no_dups[] = { "grpcomm", "odls", "rml", "routed", NULL }; bool takeus = false; opal_output_verbose(1, orte_schizo_base_framework.framework_output, "%s schizo:ompi: parse_cli", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); /* if they gave us a list of personalities, * see if we are included */ if (NULL != orte_schizo_base.personalities) { for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { takeus = true; break; } } if (!takeus) { return ORTE_ERR_TAKE_NEXT_OPTION; } } else { /* attempt to auto-detect CLI options that * we recognize */ } for (i = 0; i < (argc-start); ++i) { if (0 == strcmp("-mca", argv[i]) || 0 == strcmp("--mca", argv[i]) ) { /* ignore this one */ if (0 == strcmp(argv[i+1], "mca_base_env_list")) { i += 2; continue; } /* It would be nice to avoid increasing the length * of the orted cmd line by removing any non-ORTE * params. However, this raises a problem since * there could be OPAL directives that we really * -do- want the orted to see - it's only the OMPI * related directives we could ignore. This becomes * a very complicated procedure, however, since * the OMPI mca params are not cleanly separated - so * filtering them out is nearly impossible. * * see if this is already present so we at least can * avoid growing the cmd line with duplicates */ ignore = false; if (NULL != orted_cmd_line) { for (j=0; NULL != orted_cmd_line[j]; j++) { if (0 == strcmp(argv[i+1], orted_cmd_line[j])) { /* already here - if the value is the same, * we can quietly ignore the fact that they * provide it more than once. However, some * frameworks are known to have problems if the * value is different. We don't have a good way * to know this, but we at least make a crude * attempt here to protect ourselves. */ if (0 == strcmp(argv[i+2], orted_cmd_line[j+1])) { /* values are the same */ ignore = true; break; } else { /* values are different - see if this is a problem */ for (k=0; NULL != no_dups[k]; k++) { if (0 == strcmp(no_dups[k], argv[i+1])) { /* print help message * and abort as we cannot know which one is correct */ orte_show_help("help-orterun.txt", "orterun:conflicting-params", true, orte_basename, argv[i+1], argv[i+2], orted_cmd_line[j+1]); return ORTE_ERR_BAD_PARAM; } } /* this passed muster - just ignore it */ ignore = true; break; } } } } if (!ignore) { opal_argv_append_nosize(&orted_cmd_line, argv[i]); opal_argv_append_nosize(&orted_cmd_line, argv[i+1]); opal_argv_append_nosize(&orted_cmd_line, argv[i+2]); } i += 2; } } return ORTE_SUCCESS; }
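/*
 * Illustration (not part of ORTE): the duplicate check in parse_cli()
 * scans the accumulated "key value" pairs for the same key so it can
 * compare values.  Stripped of the MCA plumbing, that scan over a
 * NULL-terminated, alternating key/value array looks like this; the
 * function name is invented for the example.
 */
#include <string.h>

/* pairs[] alternates key, value, key, value, ..., ending with NULL;
 * returns the stored value for key, or NULL if the key is absent */
static const char *find_pair(char *const *pairs, const char *key)
{
    int j;
    for (j = 0; NULL != pairs[j] && NULL != pairs[j + 1]; j += 2) {
        if (0 == strcmp(pairs[j], key)) {
            return pairs[j + 1];
        }
    }
    return NULL;
}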
static int setup_fork(orte_job_t *jdata, orte_app_context_t *app) { int i; char *param; bool oversubscribed; orte_node_t *node; char **envcpy, **nps, **firstranks; char *npstring, *firstrankstring; char *num_app_ctx; bool takeus = false; opal_output_verbose(1, orte_schizo_base_framework.framework_output, "%s schizo:ompi: setup_fork", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if (NULL != orte_schizo_base.personalities) { /* see if we are included */ for (i=0; NULL != jdata->personality[i]; i++) { if (0 == strcmp(jdata->personality[i], "ompi")) { takeus = true; break; } } if (!takeus) { return ORTE_ERR_TAKE_NEXT_OPTION; } } /* see if the mapper thinks we are oversubscribed */ oversubscribed = false; if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_OVERSUBSCRIBED)) { oversubscribed = true; } /* setup base environment: copy the current environ and merge in the app context environ */ if (NULL != app->env) { /* manually free original context->env to avoid a memory leak */ char **tmp = app->env; envcpy = opal_environ_merge(orte_launch_environ, app->env); if (NULL != tmp) { opal_argv_free(tmp); } } else { envcpy = opal_argv_copy(orte_launch_environ); } app->env = envcpy; /* special case handling for --prefix: this is somewhat icky, but at least some users do this. :-\ It is possible that when using --prefix, the user will also "-x PATH" and/or "-x LD_LIBRARY_PATH", which would therefore clobber the work that was done in the prior pls to ensure that we have the prefix at the beginning of the PATH and LD_LIBRARY_PATH. So examine the context->env and see if we find PATH or LD_LIBRARY_PATH. If found, that means the prior work was clobbered, and we need to re-prefix those variables. */ param = NULL; orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&param, OPAL_STRING); for (i = 0; NULL != param && NULL != app->env && NULL != app->env[i]; ++i) { char *newenv; /* Reset PATH */ if (0 == strncmp("PATH=", app->env[i], 5)) { asprintf(&newenv, "%s/bin:%s", param, app->env[i] + 5); opal_setenv("PATH", newenv, true, &app->env); free(newenv); } /* Reset LD_LIBRARY_PATH */ else if (0 == strncmp("LD_LIBRARY_PATH=", app->env[i], 16)) { asprintf(&newenv, "%s/lib:%s", param, app->env[i] + 16); opal_setenv("LD_LIBRARY_PATH", newenv, true, &app->env); free(newenv); } } if (NULL != param) { free(param); } /* pass my contact info to the local proc so we can talk */ opal_setenv("OMPI_MCA_orte_local_daemon_uri", orte_process_info.my_daemon_uri, true, &app->env); /* pass the hnp's contact info to the local proc in case it * needs it */ if (NULL != orte_process_info.my_hnp_uri) { opal_setenv("OMPI_MCA_orte_hnp_uri", orte_process_info.my_hnp_uri, true, &app->env); } /* setup yield schedule - do not override any user-supplied directive! */ if (oversubscribed) { opal_setenv("OMPI_MCA_mpi_yield_when_idle", "1", false, &app->env); } else { opal_setenv("OMPI_MCA_mpi_yield_when_idle", "0", false, &app->env); } /* set the app_context number into the environment */ asprintf(&param, "%ld", (long)app->idx); opal_setenv("OMPI_MCA_orte_app_num", param, true, &app->env); free(param); /* although the total_slots_alloc is the universe size, users * would appreciate being given a public environmental variable * that also represents this value - something MPI specific - so * do that here. Also required by the ompi_attributes code! 
* * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ asprintf(&param, "%ld", (long)jdata->total_slots_alloc); opal_setenv("OMPI_UNIVERSE_SIZE", param, true, &app->env); free(param); /* pass the number of nodes involved in this job */ asprintf(&param, "%ld", (long)(jdata->map->num_nodes)); opal_setenv("OMPI_MCA_orte_num_nodes", param, true, &app->env); free(param); /* pass a param telling the child what type and model of cpu we are on, * if we know it. If hwloc has the value, use what it knows. Otherwise, * see if we were explicitly given it and use that value. */ hwloc_obj_t obj; char *htmp; if (NULL != opal_hwloc_topology) { obj = hwloc_get_root_obj(opal_hwloc_topology); if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUType")) || NULL != (htmp = orte_local_cpu_type)) { opal_setenv("OMPI_MCA_orte_cpu_type", htmp, true, &app->env); } if (NULL != (htmp = (char*)hwloc_obj_get_info_by_name(obj, "CPUModel")) || NULL != (htmp = orte_local_cpu_model)) { opal_setenv("OMPI_MCA_orte_cpu_model", htmp, true, &app->env); } } else { if (NULL != orte_local_cpu_type) { opal_setenv("OMPI_MCA_orte_cpu_type", orte_local_cpu_type, true, &app->env); } if (NULL != orte_local_cpu_model) { opal_setenv("OMPI_MCA_orte_cpu_model", orte_local_cpu_model, true, &app->env); } } /* get shmem's best component name so we can provide a hint to the shmem * framework. the idea here is to have someone figure out what component to * select (via the shmem framework) and then have the rest of the * components in shmem obey that decision. for more details take a look at * the shmem framework in opal. */ if (NULL != (param = opal_shmem_base_best_runnable_component_name())) { opal_setenv("OMPI_MCA_shmem_RUNTIME_QUERY_hint", param, true, &app->env); free(param); } /* Set an info MCA param that tells the launched processes that * any binding policy was applied by us (e.g., so that * MPI_INIT doesn't try to bind itself) */ opal_setenv("OMPI_MCA_orte_bound_at_launch", "1", true, &app->env); /* tell the ESS to avoid the singleton component - but don't override * anything that may have been provided elsewhere */ opal_setenv("OMPI_MCA_ess", "^singleton", false, &app->env); /* ensure that the spawned process ignores direct launch components, * but do not override anything we were given */ opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray", false, &app->env); /* since we want to pass the name as separate components, make sure * that the "name" environmental variable is cleared! */ opal_unsetenv("OMPI_MCA_orte_ess_name", &app->env); asprintf(&param, "%ld", (long)jdata->num_procs); opal_setenv("OMPI_MCA_orte_ess_num_procs", param, true, &app->env); /* although the num_procs is the comm_world size, users * would appreciate being given a public environmental variable * that also represents this value - something MPI specific - so * do that here. * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ opal_setenv("OMPI_COMM_WORLD_SIZE", param, true, &app->env); free(param); /* users would appreciate being given a public environmental variable * that also represents this value - something MPI specific - so * do that here. * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. 
* We know - just live with it */ asprintf(&param, "%ld", (long)jdata->num_local_procs); opal_setenv("OMPI_COMM_WORLD_LOCAL_SIZE", param, true, &app->env); free(param); /* forcibly set the local tmpdir base to match ours */ opal_setenv("OMPI_MCA_orte_tmpdir_base", orte_process_info.tmpdir_base, true, &app->env); /* MPI-3 requires we provide some further info to the procs, * so we pass them as envars to avoid introducing further * ORTE calls in the MPI layer */ asprintf(&num_app_ctx, "%lu", (unsigned long)jdata->num_apps); /* build some common envars we need to pass for MPI-3 compatibility */ nps = NULL; firstranks = NULL; for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } opal_argv_append_nosize(&nps, ORTE_VPID_PRINT(app->num_procs)); opal_argv_append_nosize(&firstranks, ORTE_VPID_PRINT(app->first_rank)); } npstring = opal_argv_join(nps, ' '); firstrankstring = opal_argv_join(firstranks, ' '); opal_argv_free(nps); opal_argv_free(firstranks); /* add the MPI-3 envars */ opal_setenv("OMPI_NUM_APP_CTX", num_app_ctx, true, &app->env); opal_setenv("OMPI_FIRST_RANKS", firstrankstring, true, &app->env); opal_setenv("OMPI_APP_CTX_NUM_PROCS", npstring, true, &app->env); free(num_app_ctx); free(firstrankstring); free(npstring); return ORTE_SUCCESS; }
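/*
 * Illustration (not part of ORTE): setup_fork() publishes the per-app
 * process counts as one space-separated string (OMPI_APP_CTX_NUM_PROCS)
 * via opal_argv_join().  The same join done directly with snprintf is
 * sketched below; the buffer sizing is an assumption for the example.
 */
#include <stdio.h>
#include <stdlib.h>

/* join n counts as "c0 c1 c2 ..."; caller frees the result */
static char *join_counts(const long *counts, int n)
{
    size_t cap, off = 0;
    char *s;
    int i;
    if (n <= 0) {
        return NULL;
    }
    cap = (size_t)n * 22 + 1;          /* ~21 chars per long + separator */
    s = malloc(cap);
    if (NULL == s) {
        return NULL;
    }
    for (i = 0; i < n; i++) {
        off += (size_t)snprintf(s + off, cap - off, "%s%ld",
                                (0 == i) ? "" : " ", counts[i]);
    }
    return s;
}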
int orte_util_add_hostfile_nodes(opal_list_t *nodes, char *hostfile) { opal_list_t exclude, adds; opal_list_item_t *item, *itm; int rc; orte_node_t *nd, *node; bool found; OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, "%s hostfile: checking hostfile %s for nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); OBJ_CONSTRUCT(&exclude, opal_list_t); OBJ_CONSTRUCT(&adds, opal_list_t); /* parse the hostfile and add any new contents to the list */ if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &adds, &exclude, false))) { goto cleanup; } /* check for any relative node directives */ for (item = opal_list_get_first(&adds); item != opal_list_get_end(&adds); item = opal_list_get_next(item)) { node=(orte_node_t*)item; if ('+' == node->name[0]) { orte_show_help("help-hostfile.txt", "hostfile:relative-syntax", true, node->name); rc = ORTE_ERR_SILENT; goto cleanup; } } /* remove from the list of nodes those that are in the exclude list */ while (NULL != (item = opal_list_remove_first(&exclude))) { nd = (orte_node_t*)item; /* check for matches on nodes */ for (itm = opal_list_get_first(&adds); itm != opal_list_get_end(&adds); itm = opal_list_get_next(itm)) { node = (orte_node_t*)itm; if (0 == strcmp(nd->name, node->name)) { /* match - remove it */ opal_list_remove_item(&adds, itm); OBJ_RELEASE(itm); break; } } OBJ_RELEASE(item); } /* transfer across all unique nodes */ while (NULL != (item = opal_list_remove_first(&adds))) { nd = (orte_node_t*)item; found = false; for (itm = opal_list_get_first(nodes); itm != opal_list_get_end(nodes); itm = opal_list_get_next(itm)) { node = (orte_node_t*)itm; if (0 == strcmp(nd->name, node->name)) { found = true; break; } } if (!found) { opal_list_append(nodes, &nd->super); OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, "%s hostfile: adding node %s slots %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nd->name, nd->slots)); } else { OBJ_RELEASE(item); } } cleanup: OPAL_LIST_DESTRUCT(&exclude); OPAL_LIST_DESTRUCT(&adds); return rc; }
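/*
 * Illustration (not part of ORTE): the final loop of
 * orte_util_add_hostfile_nodes() appends a parsed node to the caller's
 * list only if no node of the same name is already present.  On a plain
 * string array the same unique-append looks like this; the names are
 * invented for the example.
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* append name if absent; returns true if it was added */
static bool append_unique(const char **list, size_t *len, size_t cap,
                          const char *name)
{
    size_t i;
    for (i = 0; i < *len; i++) {
        if (0 == strcmp(list[i], name)) {
            return false;              /* duplicate - caller keeps it */
        }
    }
    if (*len >= cap) {
        return false;                  /* no room */
    }
    list[(*len)++] = name;
    return true;
}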
/* Parse the provided hostfile and filter the nodes that are * on the input list, removing those that * are not found in the hostfile */ int orte_util_filter_hostfile_nodes(opal_list_t *nodes, char *hostfile, bool remove) { opal_list_t newnodes, exclude; opal_list_item_t *item1, *item2, *next, *item3; orte_node_t *node_from_list, *node_from_file, *node_from_pool, *node3; int rc = ORTE_SUCCESS; char *cptr; int num_empty, nodeidx; bool want_all_empty = false; opal_list_t keep; bool found; OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, "%s hostfile: filtering nodes through hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); /* parse the hostfile and create local list of findings */ OBJ_CONSTRUCT(&newnodes, opal_list_t); OBJ_CONSTRUCT(&exclude, opal_list_t); if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) { OBJ_DESTRUCT(&newnodes); OBJ_DESTRUCT(&exclude); return rc; } /* if the hostfile was empty, then treat it as a no-op filter */ if (0 == opal_list_get_size(&newnodes)) { OBJ_DESTRUCT(&newnodes); OBJ_DESTRUCT(&exclude); /* indicate that the hostfile was empty */ return ORTE_ERR_TAKE_NEXT_OPTION; } /* remove from the list of newnodes those that are in the exclude list, * since the parse above could have added duplicate names */ while (NULL != (item1 = opal_list_remove_first(&exclude))) { node_from_file = (orte_node_t*)item1; /* check for matches on nodes */ for (item2 = opal_list_get_first(&newnodes); item2 != opal_list_get_end(&newnodes); item2 = opal_list_get_next(item2)) { orte_node_t *node = (orte_node_t*)item2; if (0 == strcmp(node_from_file->name, node->name)) { /* match - remove it */ opal_output(0, "HOST %s ON EXCLUDE LIST - REMOVING", node->name); opal_list_remove_item(&newnodes, item2); OBJ_RELEASE(item2); break; } } OBJ_RELEASE(item1); } /* now check our nodes and keep or mark those that match. We can * destruct our hostfile list as we go since this won't be needed */ OBJ_CONSTRUCT(&keep, opal_list_t); while (NULL != (item2 = opal_list_remove_first(&newnodes))) { node_from_file = (orte_node_t*)item2; next = opal_list_get_next(item2); /* see if this is a relative node syntax */ if ('+' == node_from_file->name[0]) { /* see if we specified empty nodes */ if ('e' == node_from_file->name[1] || 'E' == node_from_file->name[1]) { /* request for empty nodes - do they want * all of them? 
*/ if (NULL != (cptr = strchr(node_from_file->name, ':'))) { /* the colon indicates a specific # are requested */ cptr++; /* step past : */ num_empty = strtol(cptr, NULL, 10); } else { /* want them all - set num_empty to max */ num_empty = INT_MAX; want_all_empty = true; } /* search the list of nodes provided to us and find those * that are empty */ item1 = opal_list_get_first(nodes); while (0 < num_empty && item1 != opal_list_get_end(nodes)) { node_from_list = (orte_node_t*)item1; next = opal_list_get_next(item1); /* keep our place */ if (0 == node_from_list->slots_inuse) { /* check to see if this node is explicitly called * out later - if so, don't use it here */ for (item3 = opal_list_get_first(&newnodes); item3 != opal_list_get_end(&newnodes); item3 = opal_list_get_next(item3)) { node3 = (orte_node_t*)item3; if (0 == strcmp(node3->name, node_from_list->name)) { /* match - don't use it */ goto skipnode; } } if (remove) { /* remove item from list */ opal_list_remove_item(nodes, item1); /* xfer to keep list */ opal_list_append(&keep, item1); } else { /* mark as included */ node_from_list->mapped = true; } --num_empty; } skipnode: item1 = next; } /* did they get everything they wanted? */ if (!want_all_empty && 0 < num_empty) { orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty", true, num_empty); rc = ORTE_ERR_SILENT; goto cleanup; } } else if ('n' == node_from_file->name[1] || 'N' == node_from_file->name[1]) { /* they want a specific relative node #, so * look it up on global pool */ nodeidx = strtol(&node_from_file->name[2], NULL, 10); if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) { /* this is an error */ orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found", true, nodeidx, node_from_file->name); rc = ORTE_ERR_SILENT; goto cleanup; } /* search the list of nodes provided to us and find it */ for (item1 = opal_list_get_first(nodes); item1 != opal_list_get_end(nodes); item1 = opal_list_get_next(item1)) { node_from_list = (orte_node_t*)item1; if (0 == strcmp(node_from_list->name, node_from_pool->name)) { if (remove) { /* match - remove item from list */ opal_list_remove_item(nodes, item1); /* xfer to keep list */ opal_list_append(&keep, item1); } else { /* mark as included */ node_from_list->mapped = true; } break; } } } else { /* invalid relative node syntax */ orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax", true, node_from_file->name); rc = ORTE_ERR_SILENT; goto cleanup; } } else { /* we are looking for a specific node on the list * search the provided list of nodes to see if this * one is found */ found = false; for (item1 = opal_list_get_first(nodes); item1 != opal_list_get_end(nodes); item1 = opal_list_get_next(item1)) { node_from_list = (orte_node_t*)item1; /* since the name in the hostfile might not match * our local name, and yet still be intended to match, * we have to check for local interfaces */ if (0 == strcmp(node_from_file->name, node_from_list->name) || (0 == strcmp(node_from_file->name, "localhost") && 0 == strcmp(node_from_list->name, orte_process_info.nodename)) || (opal_ifislocal(node_from_list->name) && opal_ifislocal(node_from_file->name))) { /* if the slot count here is less than the * total slots avail on this node, set it * to the specified count - this allows people * to subdivide an allocation */ if (node_from_file->slots < node_from_list->slots) { node_from_list->slots = node_from_file->slots; } if (remove) { /* remove the node from the list */ 
opal_list_remove_item(nodes, item1); /* xfer it to keep list */ opal_list_append(&keep, item1); } else { /* mark as included */ node_from_list->mapped = true; } found = true; break; } } /* if the host in the newnode list wasn't found, * then that is an error we need to report to the * user and abort */ if (!found) { orte_show_help("help-hostfile.txt", "hostfile:extra-node-not-found", true, hostfile, node_from_file->name); rc = ORTE_ERR_SILENT; goto cleanup; } } /* cleanup the newnode list */ OBJ_RELEASE(item2); } /* if we still have entries on our hostfile list, then * there were requested hosts that were not in our allocation. * This is an error - report it to the user and return an error */ if (0 != opal_list_get_size(&newnodes)) { orte_show_help("help-hostfile.txt", "not-all-mapped-alloc", true, hostfile); while (NULL != (item1 = opal_list_remove_first(&newnodes))) { OBJ_RELEASE(item1); } OBJ_DESTRUCT(&newnodes); return ORTE_ERR_SILENT; } if (!remove) { /* all done */ OBJ_DESTRUCT(&newnodes); return ORTE_SUCCESS; } /* clear the rest of the nodes list */ while (NULL != (item1 = opal_list_remove_first(nodes))) { OBJ_RELEASE(item1); } /* the nodes list has been cleared - rebuild it in order */ while (NULL != (item1 = opal_list_remove_first(&keep))) { opal_list_append(nodes, item1); } cleanup: OBJ_DESTRUCT(&newnodes); return rc; }
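/*
 * Illustration (not part of ORTE): the relative-node branches above parse
 * "+e" / "+e:N" (empty nodes) and "+nX" (node-pool index) out of a
 * hostfile token.  A standalone parser for that small grammar is sketched
 * below; the struct and return conventions are assumptions, not the
 * hostfile code's real interface.
 */
#include <limits.h>
#include <stdlib.h>
#include <string.h>

struct relspec {
    int empty;      /* nonzero for the "+e"/"+E" forms */
    int count;      /* requested empty nodes; INT_MAX means "all" */
    int index;      /* node-pool index for "+n"/"+N" forms, else -1 */
};

/* returns 0 on success, -1 on a syntax error */
static int parse_relative(const char *tok, struct relspec *out)
{
    out->empty = 0;
    out->count = 0;
    out->index = -1;
    if ('+' != tok[0]) {
        return -1;
    }
    if ('e' == tok[1] || 'E' == tok[1]) {
        const char *colon = strchr(tok, ':');
        out->empty = 1;
        out->count = colon ? (int)strtol(colon + 1, NULL, 10) : INT_MAX;
        return 0;
    }
    if ('n' == tok[1] || 'N' == tok[1]) {
        out->index = (int)strtol(tok + 2, NULL, 10);
        return 0;
    }
    return -1;
}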
int orte_plm_proxy_spawn(orte_job_t *jdata) { opal_buffer_t buf; orte_plm_cmd_flag_t command; orte_std_cntr_t count; orte_process_name_t *target; int rc; OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:proxy spawn child job", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup the buffer */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* tell the HNP we are sending a launch request */ command = ORTE_PLM_LAUNCH_JOB_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* pack the jdata object */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* identify who gets this command - the HNP or the local orted */ if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SPAWN) { /* for now, this is unsupported */ opal_output(0, "LOCAL DAEMON SPAWN IS CURRENTLY UNSUPPORTED"); target = ORTE_PROC_MY_HNP; /* target = ORTE_PROC_MY_DAEMON; */ } else { target = ORTE_PROC_MY_HNP; } OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:proxy sending spawn cmd to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(target))); /* tell the target to launch the job */ if (0 > (rc = orte_rml.send_buffer(target, &buf, ORTE_RML_TAG_PLM, 0))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } OBJ_DESTRUCT(&buf); OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:base:proxy waiting for response", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* wait for the target's response */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_PLM_PROXY, 0))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* get the new jobid back in case the caller wants it */ count = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(jdata->jobid), &count, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } if (ORTE_JOBID_INVALID == jdata->jobid) { /* something went wrong on far end - go no further */ rc = ORTE_ERR_FAILED_TO_START; goto CLEANUP; } /* good to go! */ CLEANUP: OBJ_DESTRUCT(&buf); return rc; }
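/*
 * Illustration (not part of ORTE): orte_plm_proxy_spawn() reports failure
 * in-band - the far end packs ORTE_JOBID_INVALID instead of a real jobid,
 * and the caller maps that sentinel onto an error code.  The shape of the
 * convention, with invented names:
 */
#include <stdint.h>

#define EX_JOBID_INVALID UINT32_MAX    /* stand-in sentinel value */

/* returns 0 if the reply carries a usable jobid, -1 on in-band failure */
static int check_spawn_reply(uint32_t jobid_from_reply)
{
    return (EX_JOBID_INVALID == jobid_from_reply) ? -1 : 0;
}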
static int hostfile_parse_line(int token, opal_list_t* updates,
                               opal_list_t* exclude, bool keep_all)
{
    int rc;
    orte_node_t* node;
    bool got_max = false;
    char* value;
    char** argv;
    char* node_name = NULL;
    char* node_alias = NULL;
    char* username = NULL;
    int cnt;
    char buff[64];

    if (ORTE_HOSTFILE_STRING == token ||
        ORTE_HOSTFILE_HOSTNAME == token ||
        ORTE_HOSTFILE_INT == token ||
        ORTE_HOSTFILE_IPV4 == token ||
        ORTE_HOSTFILE_IPV6 == token) {
        if (ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        argv = opal_argv_split(value, '@');
        cnt = opal_argv_count(argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            /* cannot continue without a usable node name */
            opal_output(0, "WARNING: unhandled user@host combination\n");
            opal_argv_free(argv);
            return ORTE_ERROR;
        }
        opal_argv_free(argv);

        /* if the first letter of the name is '^', then this is a node
         * to be excluded. Remove the ^ character so the nodename is
         * usable, and put it on the exclude list */
        if ('^' == node_name[0]) {
            int i, len;
            len = strlen(node_name);
            for (i = 1; i < len; i++) {
                node_name[i-1] = node_name[i];
            }
            node_name[len-1] = '\0';  /* truncate */

            OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                                 "%s hostfile: node %s is being excluded",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));

            /* convert this into something globally unique */
            if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
                /* the nodename has been allocated, that is for sure */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(node_name, orte_process_info.nodename)) {
                    node_alias = strdup(node_name);
                }
                free(node_name);
                node_name = strdup(orte_process_info.nodename);
            }

            /* Do we need to make a new node object? First check to see
             * if it's already in the exclude list */
            if (NULL == (node = hostfile_lookup(exclude, node_name, keep_all))) {
                node = OBJ_NEW(orte_node_t);
                node->name = node_name;
                if (NULL != username) {
                    node->username = strdup(username);
                }
            }
            /* Note that we need to add this back to the exclude list.
             * If it was found, we just removed it (in hostfile_lookup()),
             * so this puts it back. If it was not found, then we have to
             * add it to the exclude list anyway. */
            opal_list_append(exclude, &node->super);
            return ORTE_SUCCESS;
        }

        /* this is not a node to be excluded, so we need to process it and
         * add it to the "include" list. See if this host is actually us. */
        if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
            /* the nodename has been allocated, that is for sure */
            if (orte_show_resolved_nodenames &&
                0 != strcmp(node_name, orte_process_info.nodename)) {
                node_alias = strdup(node_name);
            }
            free(node_name);
            node_name = strdup(orte_process_info.nodename);
        }

        OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s is being included - keep all is %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
                             keep_all ? "TRUE" : "FALSE"));

        /* Do we need to make a new node object? */
        if (NULL == (node = hostfile_lookup(updates, node_name, keep_all))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* this node was already found once - add a slot and mark slots as "given" */
            node->slots++;
            node->slots_given = true;
        }
        /* do we need to record an alias for this node? */
        if (NULL != node_alias) {
            /* add to list of aliases for this node - only add if unique */
            opal_argv_append_unique_nosize(&node->alias, node_alias, false);
            free(node_alias);
        }
    } else if (ORTE_HOSTFILE_RELATIVE == token) {
        /* store this for later processing */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(orte_util_hostfile_value.sval);
        if (NULL != username) {
            node->username = strdup(username);
        }
    } else if (ORTE_HOSTFILE_RANK == token) {
        /* we can ignore the rank, but we need to extract the node name. we
         * first need to shift over to the other side of the equal sign as
         * this is where the node name will be */
        while (!orte_util_hostfile_done && ORTE_HOSTFILE_EQUAL != token) {
            token = orte_util_hostfile_lex();
        }
        if (orte_util_hostfile_done) {
            /* bad syntax somewhere */
            return ORTE_ERROR;
        }
        /* next position should be the node name */
        token = orte_util_hostfile_lex();
        if (ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        argv = opal_argv_split(value, '@');
        cnt = opal_argv_count(argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            /* cannot continue without a usable node name */
            opal_output(0, "WARNING: unhandled user@host combination\n");
            opal_argv_free(argv);
            return ORTE_ERROR;
        }
        opal_argv_free(argv);

        /* Do we need to make a new node object? */
        if (NULL == (node = hostfile_lookup(updates, node_name, keep_all))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* add a slot */
            node->slots++;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s slots %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             node->name, node->slots));
        /* mark the slots as "given" since we take them as being the
         * number specified via the rankfile */
        node->slots_given = true;

        /* skip to end of line */
        while (!orte_util_hostfile_done && ORTE_HOSTFILE_NEWLINE != token) {
            token = orte_util_hostfile_lex();
        }
        opal_list_append(updates, &node->super);
        return ORTE_SUCCESS;
    } else {
        hostfile_parse_error(token);
        return ORTE_ERROR;
    }

    while (!orte_util_hostfile_done) {
        token = orte_util_hostfile_lex();
        switch (token) {
        case ORTE_HOSTFILE_DONE:
            goto done;
        case ORTE_HOSTFILE_NEWLINE:
            goto done;
        case ORTE_HOSTFILE_USERNAME:
            node->username = hostfile_parse_string();
            break;
        case ORTE_HOSTFILE_COUNT:
        case ORTE_HOSTFILE_CPU:
        case ORTE_HOSTFILE_SLOTS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "slots", true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            if (node->slots_given) {
                /* multiple definitions were given for the
                 * slot count - this is not allowed */
                orte_show_help("help-hostfile.txt", "slots-given", true,
                               cur_hostfile_name, node->name);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->slots = rc;
            node->slots_given = true;
            /* Ensure that slots_max >= slots */
            if (node->slots_max != 0 && node->slots_max < node->slots) {
                node->slots_max = node->slots;
            }
            break;
        case ORTE_HOSTFILE_SLOTS_MAX:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "max_slots", true,
                               cur_hostfile_name, ((size_t) rc));
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->slots) {
                if (node->slots_max != rc) {
                    node->slots_max = rc;
                    got_max = true;
                }
            } else {
                orte_show_help("help-hostfile.txt", "max_slots_lt", true,
                               cur_hostfile_name, node->slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;
        default:
            hostfile_parse_error(token);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

done:
    if (got_max && !node->slots_given) {
        node->slots = node->slots_max;
        node->slots_given = true;
    }
    opal_list_append(updates, &node->super);
    return ORTE_SUCCESS;
}
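/*
 * Added commentary (not in the original source): for reference, the token
 * handling above corresponds to hostfile lines such as (hypothetical names):
 *
 *   nodeA slots=4 max_slots=8     explicit slot counts ("slots", "cpus",
 *                                 and "count" are synonyms)
 *   user@nodeB                    optional username prefix
 *   ^nodeC                        leading '^' puts the node on the exclude list
 *   +e:2                          relative request, resolved later, for two
 *                                 empty (unused) nodes
 *   +n3                           relative reference to node #3 of the pool
 *   rank 0=nodeA ...              rankfile-style line; each occurrence adds
 *                                 a slot and marks the count as "given"
 *
 * When duplicates are being tracked (keep_all), a plain hostname repeated on
 * several lines likewise accumulates one slot per appearance.
 */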
int orte_rmaps_base_set_mapping_policy(orte_mapping_policy_t *policy,
                                       char **device, char *inspec)
{
    char *ck;
    char *ptr;
    orte_mapping_policy_t tmp;
    int rc;
    size_t len;
    char *spec;
    char *pch;

    /* set defaults */
    tmp = 0;
    if (NULL != device) {
        *device = NULL;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "%s rmaps:base set policy with %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == inspec) ? "NULL" : inspec);

    if (NULL == inspec) {
        ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
    } else {
        spec = strdup(inspec);  // protect the input string
        /* see if a colon was included - if so, then we have a policy + modifier */
        ck = strchr(spec, ':');
        if (NULL != ck) {
            /* if the colon is the first character of the string, then we
             * just have modifiers on the default mapping policy */
            if (ck == spec) {
                ck++;
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s rmaps:base only modifiers %s provided - assuming bysocket mapping",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ck);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
                if (ORTE_ERR_SILENT == (rc = check_modifiers(ck, &tmp)) &&
                    ORTE_ERR_BAD_PARAM != rc) {
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                free(spec);
                goto setpolicy;
            }
            /* split the string */
            *ck = '\0';
            ck++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s rmaps:base policy %s modifiers %s provided",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), spec, ck);
            /* if the policy is "ppr", then we save the pattern and may
             * still have to process trailing modifiers */
            if (0 == strncasecmp(spec, "ppr", strlen(spec))) {
                /* we have to allow additional modifiers here - e.g., specifying
                 * #pe's/proc or oversubscribe - so check for modifiers */
                if (NULL == (ptr = strrchr(ck, ':'))) {
                    /* this is an error - there had to be at least one
                     * colon to delimit the number from the object type */
                    orte_show_help("help-orte-rmaps-base.txt", "invalid-pattern", true, inspec);
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                ptr++;  // move past the colon
                /* check the remaining string for modifiers - may be none, so
                 * don't emit an error message if the modifier isn't recognized */
                if (ORTE_ERR_SILENT == (rc = check_modifiers(ptr, &tmp)) &&
                    ORTE_ERR_BAD_PARAM != rc) {
                    free(spec);
                    return ORTE_ERR_SILENT;
                }
                /* if we found something, then we need to adjust the string */
                if (ORTE_SUCCESS == rc) {
                    ptr--;
                    *ptr = '\0';
                }
                /* now get the pattern */
                orte_rmaps_base.ppr = strdup(ck);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_PPR);
                ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
                free(spec);
                goto setpolicy;
            }
            if (ORTE_SUCCESS != (rc = check_modifiers(ck, &tmp)) &&
                ORTE_ERR_TAKE_NEXT_OPTION != rc) {
                if (ORTE_ERR_BAD_PARAM == rc) {
                    orte_show_help("help-orte-rmaps-base.txt", "unrecognized-modifier", true, inspec);
                }
                free(spec);
                return rc;
            }
        }
        len = strlen(spec);
        if (0 == strncasecmp(spec, "slot", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSLOT);
        } else if (0 == strncasecmp(spec, "node", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNODE);
        } else if (0 == strncasecmp(spec, "seq", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_SEQ);
        } else if (0 == strncasecmp(spec, "core", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYCORE);
        } else if (0 == strncasecmp(spec, "l1cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL1CACHE);
        } else if (0 == strncasecmp(spec, "l2cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL2CACHE);
        } else if (0 == strncasecmp(spec, "l3cache", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYL3CACHE);
        } else if (0 == strncasecmp(spec, "socket", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYSOCKET);
        } else if (0 == strncasecmp(spec, "numa", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYNUMA);
        } else if (0 == strncasecmp(spec, "board", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYBOARD);
        } else if (0 == strncasecmp(spec, "hwthread", len)) {
            ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYHWTHREAD);
            /* if we are mapping processes to individual hwthreads, then
             * we need to treat those hwthreads as separate cpus */
            opal_hwloc_use_hwthreads_as_cpus = true;
        } else if (NULL != device && 0 == strncasecmp(spec, "dist", len)) {
            /* for "dist" mapping we also hand back the device whose
             * distance is to be used */
            if (NULL != rmaps_dist_device) {
                if (NULL != (pch = strchr(rmaps_dist_device, ':'))) {
                    *pch = '\0';
                }
                *device = strdup(rmaps_dist_device);
                ORTE_SET_MAPPING_POLICY(tmp, ORTE_MAPPING_BYDIST);
            } else {
                orte_show_help("help-orte-rmaps-base.txt", "device-not-specified", true);
                free(spec);
                return ORTE_ERR_SILENT;
            }
        } else {
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy", true,
                           "mapping", spec);
            free(spec);
            return ORTE_ERR_SILENT;
        }
        free(spec);
        ORTE_SET_MAPPING_DIRECTIVE(tmp, ORTE_MAPPING_GIVEN);
    }

setpolicy:
    *policy = tmp;
    return ORTE_SUCCESS;
}
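/*
 * Added commentary (not in the original source): example --map-by strings
 * accepted by the parser above:
 *
 *   "node", "slot", "seq", "core", "l1cache", "l2cache", "l3cache",
 *   "socket", "numa", "board", "hwthread"     direct policies
 *   ":oversubscribe"                          modifiers only (leading colon),
 *                                             applied to the bysocket default
 *   "ppr:2:socket"                            pattern stored in
 *                                             orte_rmaps_base.ppr
 *   "dist"                                    distance mapping; the device
 *                                             comes from rmaps_dist_device
 *                                             (e.g., "mlx4_0")
 *
 * "hwthread" additionally flips opal_hwloc_use_hwthreads_as_cpus. The
 * modifier vocabulary itself lives in check_modifiers(), which is not shown
 * here; ":oversubscribe" is offered as a plausible example only.
 */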
/** * Function for finding and opening either all MCA components, or the one * that was specifically requested via a MCA parameter. */ static int orte_rmaps_base_open(mca_base_open_flag_t flags) { int rc; /* init the globals */ OBJ_CONSTRUCT(&orte_rmaps_base.selected_modules, opal_list_t); orte_rmaps_base.slot_list = NULL; orte_rmaps_base.mapping = 0; orte_rmaps_base.ranking = 0; /* if a topology file was given, then set our topology * from it. Even though our actual topology may differ, * mpirun only needs to see the compute node topology * for mapping purposes */ if (NULL != rmaps_base_topo_file) { if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(rmaps_base_topo_file))) { orte_show_help("help-orte-rmaps-base.txt", "topo-file", true, rmaps_base_topo_file); return ORTE_ERR_SILENT; } } /* check for violations that has to be detected before we parse the mapping option */ if (NULL != orte_rmaps_base.ppr) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--ppr, -ppr", "--map-by ppr:<pattern>", "rmaps_base_pattern, rmaps_ppr_pattern", "rmaps_base_mapping_policy=ppr:<pattern>"); /* if the mapping policy is NULL, then we can proceed */ if (NULL == rmaps_base_mapping_policy) { asprintf(&rmaps_base_mapping_policy, "ppr:%s", orte_rmaps_base.ppr); } else { return ORTE_ERR_SILENT; } } if (1 < orte_rmaps_base.cpus_per_rank) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--cpus-per-proc, -cpus-per-proc, --cpus-per-rank, -cpus-per-rank", "--map-by <obj>:PE=N, default <obj>=NUMA", "rmaps_base_cpus_per_proc", "rmaps_base_mapping_policy=<obj>:PE=N, default <obj>=NUMA"); } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_mapping_policy(&orte_rmaps_base.mapping, &orte_rmaps_base.device, rmaps_base_mapping_policy))) { return rc; } if (ORTE_SUCCESS != (rc = orte_rmaps_base_set_ranking_policy(&orte_rmaps_base.ranking, orte_rmaps_base.mapping, rmaps_base_ranking_policy))) { return rc; } if (rmaps_base_bycore) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bycore, -bycore", "--map-by core", "rmaps_base_bycore", "rmaps_base_mapping_policy=core"); /* set mapping policy to bycore - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYCORE) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bycore", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYCORE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bycore - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_CORE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bycore", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_CORE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_byslot) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--byslot, -byslot", "--map-by slot", "rmaps_base_byslot", "rmaps_base_mapping_policy=slot"); /* set mapping policy to byslot - error if 
something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYSLOT) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "byslot", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYSLOT); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to byslot - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_SLOT) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "byslot", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_SLOT); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (rmaps_base_bynode) { orte_show_help("help-orte-rmaps-base.txt", "deprecated", true, "--bynode, -bynode", "--map-by node", "rmaps_base_bynode", "rmaps_base_mapping_policy=node"); /* set mapping policy to bynode - error if something else already set */ if ((ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) != ORTE_MAPPING_BYNODE) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNODE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* set ranking policy to bynode - error if something else already set */ if ((ORTE_RANKING_GIVEN & ORTE_GET_RANKING_DIRECTIVE(orte_rmaps_base.ranking)) && ORTE_GET_RANKING_POLICY(orte_rmaps_base.ranking) != ORTE_RANK_BY_NODE) { /* error - cannot redefine the default ranking policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "ranking", "bynode", orte_rmaps_base_print_ranking(orte_rmaps_base.ranking)); return ORTE_ERR_SILENT; } ORTE_SET_RANKING_POLICY(orte_rmaps_base.ranking, ORTE_RANK_BY_NODE); ORTE_SET_RANKING_DIRECTIVE(orte_rmaps_base.ranking, ORTE_RANKING_GIVEN); } if (1 < orte_rmaps_base.cpus_per_rank) { /* if we were asked for multiple cpus/proc, then we have to * bind to those cpus - any other binding policy is an * error */ if (OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy)) { if (opal_hwloc_use_hwthreads_as_cpus) { if (OPAL_BIND_TO_HWTHREAD != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "use-hwthreads-as-cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to hwthread"); return ORTE_ERR_SILENT; } } else if (OPAL_BIND_TO_CORE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { orte_show_help("help-orte-rmaps-base.txt", "mismatch-binding", true, orte_rmaps_base.cpus_per_rank, "cores as cpus", opal_hwloc_base_print_binding(opal_hwloc_binding_policy), "bind-to core"); return ORTE_ERR_SILENT; } } else { if 
(opal_hwloc_use_hwthreads_as_cpus) { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_HWTHREAD); } else { OPAL_SET_BINDING_POLICY(opal_hwloc_binding_policy, OPAL_BIND_TO_CORE); } } /* we also need to ensure we are mapping to a high-enough level to have * multiple cpus beneath it - by default, we'll go to the NUMA level */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { if (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYHWTHREAD || (ORTE_GET_MAPPING_POLICY(orte_rmaps_base.mapping) == ORTE_MAPPING_BYCORE && !opal_hwloc_use_hwthreads_as_cpus)) { orte_show_help("help-orte-rmaps-base.txt", "mapping-too-low-init", true); return ORTE_ERR_SILENT; } } else { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s rmaps:base pe/rank set - setting mapping to BYNUMA", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_BYNUMA); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); } } if (orte_rmaps_base_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ orte_rmaps_base.ppr = strdup("1:node"); } if (0 < orte_rmaps_base_n_pernode) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:node", orte_rmaps_base_n_pernode); } if (0 < orte_rmaps_base_n_persocket) { /* there is no way to resolve this conflict, so if something else was * given, we have no choice but to error out */ if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "bynode", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } /* ensure we set the mapping policy to ppr */ ORTE_SET_MAPPING_POLICY(orte_rmaps_base.mapping, ORTE_MAPPING_PPR); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_GIVEN); /* define the ppr */ asprintf(&orte_rmaps_base.ppr, "%d:socket", orte_rmaps_base_n_persocket); } /* Should we schedule on the local node or not? */ if (rmaps_base_no_schedule_local) { orte_rmaps_base.mapping |= ORTE_MAPPING_NO_USE_LOCAL; } /* Should we oversubscribe or not? 
*/ if (rmaps_base_no_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && !(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "no-oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); } /** force oversubscription permission */ if (rmaps_base_oversubscribe) { if ((ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) { /* error - cannot redefine the default mapping policy */ orte_show_help("help-orte-rmaps-base.txt", "redefining-policy", true, "mapping", "oversubscribe", orte_rmaps_base_print_mapping(orte_rmaps_base.mapping)); return ORTE_ERR_SILENT; } ORTE_UNSET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_SUBSCRIBE_GIVEN); /* also set the overload allowed flag */ opal_hwloc_binding_policy |= OPAL_BIND_ALLOW_OVERLOAD; } /* should we display a detailed (developer-quality) version of the map after determining it? */ if (rmaps_base_display_devel_map) { orte_rmaps_base.display_map = true; orte_devel_level_output = true; } /* should we display a diffable report of proc locations after determining it? */ if (rmaps_base_display_diffable_map) { orte_rmaps_base.display_map = true; orte_display_diffable_output = true; } /* Open up all available components */ rc = mca_base_framework_components_open(&orte_rmaps_base_framework, flags); /* check to see if any component indicated a problem */ if (ORTE_MAPPING_CONFLICTED & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { /* the component would have already reported the error, so * tell the rest of the chain to shut up */ return ORTE_ERR_SILENT; } /* All done */ return rc; }
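/*
 * Added commentary (derived from the deprecation warnings above): the legacy
 * switches handled in this function translate to --map-by equivalents:
 *
 *   --ppr <pattern>       ->  --map-by ppr:<pattern>
 *   --cpus-per-proc N     ->  --map-by <obj>:PE=N   (default <obj> = NUMA)
 *   --bycore              ->  --map-by core
 *   --byslot              ->  --map-by slot
 *   --bynode              ->  --map-by node
 *   --pernode             ->  a ppr of "1:node"
 *   --npernode N          ->  a ppr of "N:node"
 *   --npersocket N        ->  a ppr of "N:socket"
 *
 * Any attempt to combine one of these with an explicitly given, different
 * mapping or ranking policy is rejected via the "redefining-policy" help
 * message.
 */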
int orte_ess_base_orted_setup(char **hosts) { int ret = ORTE_ERROR; int fd; char log_file[PATH_MAX]; char *jobidstring; char *error = NULL; char *plm_to_use; orte_job_t *jdata; orte_proc_t *proc; orte_app_context_t *app; orte_node_t *node; #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); #endif /* __WINDOWS__ */ signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { error = "topology discovery"; goto error; } } /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. So * we remove that information here. */ obj = hwloc_get_root_obj(opal_hwloc_topology); for (i=0; i < obj->infos_count; i++) { if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) { continue; } if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) { free(obj->infos[i].name); free(obj->infos[i].value); /* left justify the array */ for (j=i; j < obj->infos_count-1; j++) { obj->infos[j] = obj->infos[j+1]; } obj->infos[obj->infos_count-1].name = NULL; obj->infos[obj->infos_count-1].value = NULL; obj->infos_count--; break; } } if (4 < opal_output_get_verbosity(orte_ess_base_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif /* open and setup the opal_pstat framework so we can provide * process stats if requested */ if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = orte_state_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* some environments allow remote launches - e.g., ssh - so * open and select something -only- if we are given * a specific module to use */ mca_base_param_reg_string_name("plm", NULL, "Which plm component to use (empty = none)", false, false, NULL, &plm_to_use); if (NULL == plm_to_use) { plm_in_use = false; } else { plm_in_use = true; if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } } /* Setup the communication infrastructure */ /* Runtime Messaging Layer - this opens/selects the OOB as well */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = 
orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = orte_db_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_db_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_select"; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = orte_odls_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } #if ORTE_ENABLE_STATIC_PORTS /* if we are using static ports, then we need to setup * the daemon info so the RML can function properly * without requiring a wireup stage. This must be done * after we enable_comm as that function determines our * own port, which we need in order to construct the nidmap */ if (orte_static_ports) { /* define the routing tree so we know the pattern * if we are trying to setup common or static ports */ orte_routed.update_routing_plan(); /* extract the node info from the environment and * build a nidmap from it */ if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) { ORTE_ERROR_LOG(ret); error = "construct daemon map from static ports"; goto error; } } #endif /* be sure to update the routing tree so the initial "phone home" * to mpirun goes through the tree if static ports were enabled - still * need to do it anyway just to initialize things */ orte_routed.update_routing_plan(); /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv * Do this only if a specific PLM was given to us - the * orted has no need of the proxy PLM at all */ if (plm_in_use) { if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } } /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? 
"UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ret); error = "convert_jobid"; goto error; } /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); log_path = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, log_file, NULL); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so * just connect everything to /dev/null */ fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); } else { dup2(fd, STDOUT_FILENO); dup2(fd, STDERR_FILENO); if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { close(fd); } } } } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); #if OPAL_HAVE_HWLOC /* point our topology to the one detected locally */ node->topology = opal_hwloc_topology; #endif /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. 
* Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = orte_filem_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } /* For daemons, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the SENSOR framework */ if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_sensor_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sensor_select"; goto error; } /* start the local sensors */ orte_sensor.start(ORTE_PROC_MY_NAME->jobid); return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
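/*
 * Added commentary (not in the original source): the daemon bring-up above is
 * strictly ordered. In outline: signal handlers; local topology (hwloc);
 * pstat; state machine; errmgr open; optional PLM; RML (which also
 * opens/selects the OOB); errmgr select; routed; db; grpcomm; odls;
 * rml.enable_comm (which fixes our own port); nidmap; routing plan (plus the
 * daemon nidmap when static ports are in use); plm.init; session directory
 * and optional log redirection; the global job/node/topology arrays and our
 * own job/node/proc objects; routed.init_routes; iof; filem; snapc/cr; and
 * finally the sensor framework. The in-code comments call out the ordering
 * constraints (e.g., IOF must follow init_routes).
 */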
static void launch_daemons(int fd, short args, void *cbdata) { orte_app_context_t *app; orte_node_t *node; orte_std_cntr_t n; orte_job_map_t *map; char *jobid_string = NULL; char *param; char **argv = NULL; int argc; int rc; char *tmp; char** env = NULL; char *nodelist_flat; char **nodelist_argv; char *name_string; char **custom_strings; int num_args, i; char *cur_prefix; int proc_vpid_index; bool failed_launch=true; orte_job_t *daemons; orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata; OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: LAUNCH DAEMONS CALLED", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are launching debugger daemons, then just go * do it - no new daemons will be launched */ if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) { state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* start by setting up the virtual machine */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* if we don't want to launch, then don't attempt to * launch the daemons - the user really wants to just * look at the proposed process map */ if (orte_do_not_launch) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* Get the map for this job */ if (NULL == (map = daemons->map)) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); rc = ORTE_ERR_NOT_FOUND; goto cleanup; } if (0 == map->num_new_daemons) { /* set the state to indicate the daemons reported - this * will trigger the daemons_reported event and cause the * job to move to the following step */ OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: no new daemons to launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED); OBJ_RELEASE(state); return; } /* need integer value for command line parameter */ asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid); /* * start building argv array */ argv = NULL; argc = 0; /* * SLURM srun OPTIONS */ /* add the srun command */ opal_argv_append(&argc, &argv, "srun"); /* start one orted on each node */ opal_argv_append(&argc, &argv, "--ntasks-per-node=1"); /* alert us if any orteds die during startup */ opal_argv_append(&argc, &argv, "--kill-on-bad-exit"); /* ensure the orteds are not bound to a single processor, * just in case the TaskAffinity option is set by default. 
* This will *not* release the orteds from any cpu-set * constraint, but will ensure it doesn't get * bound to only one processor */ opal_argv_append(&argc, &argv, "--cpu_bind=none"); /* Append user defined arguments to srun */ if ( NULL != mca_plm_slurm_component.custom_args ) { custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' '); num_args = opal_argv_count(custom_strings); for (i = 0; i < num_args; ++i) { opal_argv_append(&argc, &argv, custom_strings[i]); } opal_argv_free(custom_strings); } /* create nodelist */ nodelist_argv = NULL; for (n=0; n < map->nodes->size; n++ ) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { continue; } /* if the daemon already exists on this node, then * don't include it */ if (node->daemon_launched) { continue; } /* otherwise, add it to the list of nodes upon which * we need to launch a daemon */ opal_argv_append_nosize(&nodelist_argv, node->name); } if (0 == opal_argv_count(nodelist_argv)) { orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true); rc = ORTE_ERR_FAILED_TO_START; goto cleanup; } nodelist_flat = opal_argv_join(nodelist_argv, ','); opal_argv_free(nodelist_argv); /* if we are using all allocated nodes, then srun doesn't * require any further arguments */ if (map->num_new_daemons < orte_num_allocated_nodes) { asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons); opal_argv_append(&argc, &argv, tmp); free(tmp); asprintf(&tmp, "--nodelist=%s", nodelist_flat); opal_argv_append(&argc, &argv, tmp); free(tmp); } /* tell srun how many tasks to run */ asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons); opal_argv_append(&argc, &argv, tmp); free(tmp); OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output, "%s plm:slurm: launching on nodes %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat)); /* * ORTED OPTIONS */ /* add the daemon command (as specified by user) */ orte_plm_base_setup_orted_cmd(&argc, &argv); /* Add basic orted command line options, including debug flags */ orte_plm_base_orted_append_basic_args(&argc, &argv, NULL, &proc_vpid_index, nodelist_flat); free(nodelist_flat); /* tell the new daemons the base of the name list so they can compute * their own name on the other end */ rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start); if (ORTE_SUCCESS != rc) { opal_output(0, "plm_slurm: unable to get daemon vpid as string"); goto cleanup; } free(argv[proc_vpid_index]); argv[proc_vpid_index] = strdup(name_string); free(name_string); /* Copy the prefix-directory specified in the corresponding app_context. 
If there are multiple, different prefix's in the app context, complain (i.e., only allow one --prefix option for the entire slurm run -- we don't support different --prefix'es for different nodes in the SLURM plm) */ cur_prefix = NULL; for (n=0; n < state->jdata->apps->size; n++) { char * app_prefix_dir; if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) { continue; } app_prefix_dir = app->prefix_dir; /* Check for already set cur_prefix_dir -- if different, complain */ if (NULL != app_prefix_dir) { if (NULL != cur_prefix && 0 != strcmp (cur_prefix, app_prefix_dir)) { orte_show_help("help-plm-slurm.txt", "multiple-prefixes", true, cur_prefix, app_prefix_dir); goto cleanup; } /* If not yet set, copy it; iff set, then it's the * same anyway */ if (NULL == cur_prefix) { cur_prefix = strdup(app_prefix_dir); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: Set prefix:%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cur_prefix)); } } } /* setup environment */ env = opal_argv_copy(orte_launch_environ); if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) { param = opal_argv_join(argv, ' '); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: final top-level argv:\n\t%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == param) ? "NULL" : param)); if (NULL != param) free(param); } /* exec the daemon(s) */ if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* indicate that the daemons for this job were launched */ state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED; /* flag that launch was successful, so far as we currently know */ failed_launch = false; cleanup: if (NULL != argv) { opal_argv_free(argv); } if (NULL != env) { opal_argv_free(env); } if(NULL != jobid_string) { free(jobid_string); } /* cleanup the caddy */ OBJ_RELEASE(state); /* check for failed launch - if so, force terminate */ if (failed_launch) { ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); } }
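/*
 * Added illustration with hypothetical node names: for two new daemons on
 * node01/node02 (a subset of the allocation), the argv assembled above looks
 * roughly like
 *
 *   srun --ntasks-per-node=1 --kill-on-bad-exit --cpu_bind=none \
 *        --nodes=2 --nodelist=node01,node02 --ntasks=2 orted ...
 *
 * where "orted ..." stands for whatever orte_plm_base_setup_orted_cmd() and
 * orte_plm_base_orted_append_basic_args() appended, with the placeholder at
 * proc_vpid_index overwritten by the starting daemon vpid. The --nodes and
 * --nodelist arguments are omitted when the new daemons cover the entire
 * allocation.
 */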
int orte_util_get_ordered_host_list(opal_list_t *nodes, char *hostfile) { opal_list_t exclude; opal_list_item_t *item, *itm, *item2, *item1; char *cptr; int num_empty, i, nodeidx, startempty=0; bool want_all_empty=false; orte_node_t *node_from_pool, *newnode; int rc; OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output, "%s hostfile: creating ordered list of hosts from hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile)); OBJ_CONSTRUCT(&exclude, opal_list_t); /* parse the hostfile and add the contents to the list, keeping duplicates */ if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, true))) { goto cleanup; } /* parse the nodes to process any relative node directives */ item2 = opal_list_get_first(nodes); while (item2 != opal_list_get_end(nodes)) { orte_node_t *node=(orte_node_t*)item2; /* save the next location in case this one gets removed */ item1 = opal_list_get_next(item2); if ('+' != node->name[0]) { item2 = item1; continue; } /* see if we specified empty nodes */ if ('e' == node->name[1] || 'E' == node->name[1]) { /* request for empty nodes - do they want * all of them? */ if (NULL != (cptr = strchr(node->name, ':'))) { /* the colon indicates a specific # are requested */ cptr++; /* step past : */ num_empty = strtol(cptr, NULL, 10); } else { /* want them all - set num_empty to max */ num_empty = INT_MAX; want_all_empty = true; } /* insert empty nodes into newnodes list in place of the current item. * since item1 is the next item, we insert in front of it */ if (!orte_hnp_is_allocated && 0 == startempty) { startempty = 1; } for (i=startempty; 0 < num_empty && i < orte_node_pool->size; i++) { if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (0 == node_from_pool->slots_inuse) { newnode = OBJ_NEW(orte_node_t); newnode->name = strdup(node_from_pool->name); /* if the slot count here is less than the * total slots avail on this node, set it * to the specified count - this allows people * to subdivide an allocation */ if (node->slots < node_from_pool->slots) { newnode->slots = node->slots; } else { newnode->slots = node_from_pool->slots; } opal_list_insert_pos(nodes, item1, &newnode->super); /* track number added */ --num_empty; } } /* bookmark where we stopped in case they ask for more */ startempty = i; /* did they get everything they wanted? 
*/
            if (!want_all_empty && 0 < num_empty) {
                orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
                               true, num_empty);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            /* since we have expanded the provided node, remove it from list */
            opal_list_remove_item(nodes, item2);
            OBJ_RELEASE(item2);
        } else if ('n' == node->name[1] || 'N' == node->name[1]) {
            /* they want a specific relative node #, so
             * look it up on global pool */
            nodeidx = strtol(&node->name[2], NULL, 10);
            /* if the HNP is not allocated, then we need to
             * adjust the index as the node pool is offset
             * by one */
            if (!orte_hnp_is_allocated) {
                nodeidx++;
            }
            /* see if that location is filled */
            if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
                /* this is an error */
                orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
                               true, nodeidx, node->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            /* create the node object */
            newnode = OBJ_NEW(orte_node_t);
            newnode->name = strdup(node_from_pool->name);
            /* if the slot count here is less than the
             * total slots avail on this node, set it
             * to the specified count - this allows people
             * to subdivide an allocation */
            if (node->slots < node_from_pool->slots) {
                newnode->slots = node->slots;
            } else {
                newnode->slots = node_from_pool->slots;
            }
            /* insert it before item1 */
            opal_list_insert_pos(nodes, item1, &newnode->super);
            /* since we have expanded the provided node, remove it from list */
            opal_list_remove_item(nodes, item2);
            OBJ_RELEASE(item2);
        } else {
            /* invalid relative node syntax */
            orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
                           true, node->name);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
        /* move to next */
        item2 = item1;
    }

    /* remove from the list of nodes those that are in the exclude list */
    while (NULL != (item = opal_list_remove_first(&exclude))) {
        orte_node_t *exnode = (orte_node_t*)item;
        /* check for matches on nodes - we have to cycle through the
         * entire list as it could contain duplicates. Capture the next
         * item before removing a match so we never advance from an
         * already-released element */
        itm = opal_list_get_first(nodes);
        while (itm != opal_list_get_end(nodes)) {
            opal_list_item_t *next = opal_list_get_next(itm);
            orte_node_t *node = (orte_node_t*)itm;
            if (0 == strcmp(exnode->name, node->name)) {
                /* match - remove it */
                opal_list_remove_item(nodes, itm);
                OBJ_RELEASE(itm);
            }
            itm = next;
        }
        OBJ_RELEASE(item);
    }

cleanup:
    OBJ_DESTRUCT(&exclude);
    return rc;
}
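/*
 * Added illustration (hypothetical allocation): given a pool of
 * node01..node04, a hostfile of
 *
 *   +n0
 *   +e:2
 *
 * resolves +n0 to the first pool entry (offset by one when the HNP is not
 * part of the allocation) and +e:2 to the first two nodes with no slots in
 * use, each clone inheriting the smaller of the requested and available
 * slot counts.
 */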
static int plm_slurm_start_proc(int argc, char **argv, char **env, char *prefix) { int fd; int srun_pid; char *exec_argv = opal_path_findv(argv[0], 0, env, NULL); if (NULL == exec_argv) { return ORTE_ERR_NOT_FOUND; } srun_pid = fork(); if (-1 == srun_pid) { ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); free(exec_argv); return ORTE_ERR_SYS_LIMITS_CHILDREN; } if (0 == srun_pid) { /* child */ char *bin_base = NULL, *lib_base = NULL; /* Figure out the basenames for the libdir and bindir. There is a lengthy comment about this in plm_rsh_module.c explaining all the rationale for how / why we're doing this. */ lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH environment variables. */ if (NULL != prefix) { char *oldenv, *newenv; /* Reset PATH */ oldenv = getenv("PATH"); if (NULL != oldenv) { asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv); } else { asprintf(&newenv, "%s/%s", prefix, bin_base); } opal_setenv("PATH", newenv, true, &env); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: reset PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); free(newenv); /* Reset LD_LIBRARY_PATH */ oldenv = getenv("LD_LIBRARY_PATH"); if (NULL != oldenv) { asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv); } else { asprintf(&newenv, "%s/%s", prefix, lib_base); } opal_setenv("LD_LIBRARY_PATH", newenv, true, &env); OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, "%s plm:slurm: reset LD_LIBRARY_PATH: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), newenv)); free(newenv); } fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666); if (fd >= 0) { dup2(fd, 0); /* When not in debug mode and --debug-daemons was not passed, * tie stdout/stderr to dev null so we don't see messages from orted * EXCEPT if the user has requested that we leave sessions attached */ if (0 > opal_output_get_verbosity(orte_plm_base_framework.framework_output) && !orte_debug_daemons_flag && !orte_leave_session_attached) { dup2(fd,1); dup2(fd,2); } } /* get the srun process out of orterun's process group so that signals sent from the shell (like those resulting from cntl-c) don't get sent to srun */ setpgid(0, 0); execve(exec_argv, argv, env); opal_output(0, "plm:slurm:start_proc: exec failed"); /* don't return - need to exit - returning would be bad - we're not in the calling process anymore */ exit(1); } else { /* parent */ /* just in case, make sure that the srun process is not in our process group any more. Stevens says always do this on both sides of the fork... */ setpgid(srun_pid, srun_pid); /* if this is the primary launch - i.e., not a comm_spawn of a * child job - then save the pid */ if (!primary_pid_set) { primary_srun_pid = srun_pid; primary_pid_set = true; } /* setup the waitpid so we can find out if srun succeeds! */ orte_wait_cb(srun_pid, srun_wait_cb, NULL); free(exec_argv); } return ORTE_SUCCESS; }
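/*
 * Added illustration: when a --prefix of, say, /opt/openmpi is in effect,
 * the child branch above rewrites the launch environment before exec'ing
 * srun, roughly (assuming the usual "bin" and "lib" basenames):
 *
 *   PATH=/opt/openmpi/bin:$PATH
 *   LD_LIBRARY_PATH=/opt/openmpi/lib:$LD_LIBRARY_PATH
 *
 * so that the remote orteds resolve the matching Open MPI installation.
 */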
/* * Start monitoring of local processes */ static void start(orte_jobid_t jobid) { orte_job_t *jobdat; orte_app_context_t *app, *aptr; int i; char *filename; file_tracker_t *ft; char *ptr; /* cannot monitor my own job */ if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { return; } OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s starting file monitoring for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid))); /* get the local jobdat for this job */ if (NULL == (jobdat = orte_get_job_data_object(jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return; } /* must be at least one app_context, so use the first one found */ app = NULL; for (i=0; i < jobdat->apps->size; i++) { if (NULL != (aptr = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, i))) { app = aptr; break; } } if (NULL == app) { /* got a problem */ ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return; } /* search the environ to get the filename */ if (!find_value(app, "OMPI_MCA_sensor_file_filename", &filename)) { /* was a default file given */ if (NULL == mca_sensor_file_component.file) { /* can't do anything without a file */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sensor:file no file for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid))); return; } filename = mca_sensor_file_component.file; } /* create the tracking object */ ft = OBJ_NEW(file_tracker_t); ft->jobid = jobid; ft->file = strdup(filename); /* search the environ to see what we are checking */ if (!find_value(app, "OMPI_MCA_sensor_file_check_size", &ptr)) { /* was a default value given */ if (0 < mca_sensor_file_component.check_size) { ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); } } else { ft->check_size = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); free(ptr); } if (!find_value(app, "OMPI_MCA_sensor_file_check_access", &ptr)) { /* was a default value given */ if (0 < mca_sensor_file_component.check_access) { ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); } } else { ft->check_access = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); free(ptr); } if (!find_value(app, "OMPI_MCA_sensor_file_check_mod", &ptr)) { /* was a default value given */ if (0 < mca_sensor_file_component.check_mod) { ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); } } else { ft->check_mod = OPAL_INT_TO_BOOL(strtol(ptr, NULL, 10)); free(ptr); } if (!find_value(app, "OMPI_MCA_sensor_file_limit", &ptr)) { ft->limit = mca_sensor_file_component.limit; } else { ft->limit = strtol(ptr, NULL, 10); free(ptr); } opal_list_append(&jobs, &ft->super); OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s file %s monitored for %s%s%s with limit %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ft->file, ft->check_size ? "SIZE:" : " ", ft->check_access ? "ACCESS TIME:" : " ", ft->check_mod ? "MOD TIME" : " ", ft->limit)); return; }
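/*
 * Added commentary: the monitor is driven by app-level envars (looked up via
 * find_value above) with component-level fallbacks
 * (mca_sensor_file_component.*). Hypothetical settings:
 *
 *   OMPI_MCA_sensor_file_filename=/tmp/app.log   file to watch
 *   OMPI_MCA_sensor_file_check_size=1            watch the size
 *   OMPI_MCA_sensor_file_check_access=0          ignore access time
 *   OMPI_MCA_sensor_file_check_mod=1             watch modification time
 *   OMPI_MCA_sensor_file_limit=10                samples without change
 *                                                before declaring a stall
 */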
static int parse_env(char *path, opal_cmd_line_t *cmd_line, char **srcenv, char ***dstenv) { int i, j; char *param; char *value; char *env_set_flag; char **vars; bool takeus = false; opal_output_verbose(1, orte_schizo_base_framework.framework_output, "%s schizo:ompi: parse_env", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); if (NULL != orte_schizo_base.personalities) { /* see if we are included */ for (i=0; NULL != orte_schizo_base.personalities[i]; i++) { if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) { takeus = true; break; } } if (!takeus) { return ORTE_ERR_TAKE_NEXT_OPTION; } } for (i = 0; NULL != srcenv[i]; ++i) { if (0 == strncmp("OMPI_", srcenv[i], 5)) { /* check for duplicate in app->env - this * would have been placed there by the * cmd line processor. By convention, we * always let the cmd line override the * environment */ param = strdup(srcenv[i]); value = strchr(param, '='); *value = '\0'; value++; opal_setenv(param, value, false, dstenv); free(param); } } /* set necessary env variables for external usage from tune conf file*/ int set_from_file = 0; vars = NULL; if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) && NULL != vars) { for (i=0; NULL != vars[i]; i++) { value = strchr(vars[i], '='); /* terminate the name of the param */ *value = '\0'; /* step over the equals */ value++; /* overwrite any prior entry */ opal_setenv(vars[i], value, true, dstenv); /* save it for any comm_spawn'd apps */ opal_setenv(vars[i], value, true, &orte_forwarded_envars); } set_from_file = 1; opal_argv_free(vars); } /* Did the user request to export any environment variables on the cmd line? */ env_set_flag = getenv("OMPI_MCA_mca_base_env_list"); if (opal_cmd_line_is_taken(cmd_line, "x")) { if (NULL != env_set_flag) { orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false); return ORTE_ERR_FATAL; } j = opal_cmd_line_get_ninsts(cmd_line, "x"); for (i = 0; i < j; ++i) { param = opal_cmd_line_get_param(cmd_line, "x", i, 0); if (NULL != (value = strchr(param, '='))) { /* terminate the name of the param */ *value = '\0'; /* step over the equals */ value++; /* overwrite any prior entry */ opal_setenv(param, value, true, dstenv); /* save it for any comm_spawn'd apps */ opal_setenv(param, value, true, &orte_forwarded_envars); } else { value = getenv(param); if (NULL != value) { /* overwrite any prior entry */ opal_setenv(param, value, true, dstenv); /* save it for any comm_spawn'd apps */ opal_setenv(param, value, true, &orte_forwarded_envars); } else { opal_output(0, "Warning: could not find environment variable \"%s\"\n", param); } } } } else if (NULL != env_set_flag) { /* if mca_base_env_list was set, check if some of env vars were set via -x from a conf file. * If this is the case, error out. */ if (!set_from_file) { /* set necessary env variables for external usage */ vars = NULL; if (OPAL_SUCCESS == mca_base_var_process_env_list(env_set_flag, &vars) && NULL != vars) { for (i=0; NULL != vars[i]; i++) { value = strchr(vars[i], '='); /* terminate the name of the param */ *value = '\0'; /* step over the equals */ value++; /* overwrite any prior entry */ opal_setenv(vars[i], value, true, dstenv); /* save it for any comm_spawn'd apps */ opal_setenv(vars[i], value, true, &orte_forwarded_envars); } opal_argv_free(vars); } } else { orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false); return ORTE_ERR_FATAL; } } /* If the user specified --path, store it in the user's app environment via the OMPI_exec_path variable. 
*/ if (NULL != path) { asprintf(&value, "OMPI_exec_path=%s", path); opal_argv_append_nosize(dstenv, value); /* save it for any comm_spawn'd apps */ opal_argv_append_nosize(&orte_forwarded_envars, value); free(value); } return ORTE_SUCCESS; }
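/*
 * Added illustration: both forms of -x are handled above, e.g.
 *
 *   mpirun -x FOO=bar -x DISPLAY ...
 *
 * forwards FOO with the given value and DISPLAY with its current value from
 * the local environment (with a warning if DISPLAY is unset). Combining -x
 * with the OMPI_MCA_mca_base_env_list envar - or combining that envar with
 * an env list already loaded from a tune conf file - is rejected as a fatal
 * conflict.
 */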
static void file_sample(void) { struct stat buf; opal_list_item_t *item; file_tracker_t *ft; orte_job_t *jdata; OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sampling files", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); for (item = opal_list_get_first(&jobs); item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { ft = (file_tracker_t*)item; /* stat the file and get its size */ if (0 > stat(ft->file, &buf)) { /* cannot stat file */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s could not stat %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ft->file)); continue; } OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s size %lu access %s\tmod %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)buf.st_size, ctime(&buf.st_atime), ctime(&buf.st_mtime))); if (ft->check_size) { if (buf.st_size == ft->file_size) { ft->tick++; goto CHECK; } else { ft->tick = 0; ft->file_size = buf.st_size; } } if (ft->check_access) { if (buf.st_atime == ft->last_access) { ft->tick++; goto CHECK; } else { ft->tick = 0; ft->last_access = buf.st_atime; } } if (ft->check_mod) { if (buf.st_mtime == ft->last_mod) { ft->tick++; goto CHECK; } else { ft->tick = 0; ft->last_mod = buf.st_mtime; } } CHECK: OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sampled file %s tick %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ft->file, ft->tick)); if (ft->tick == ft->limit) { orte_show_help("help-orte-sensor-file.txt", "file-stalled", true, ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod)); jdata = orte_get_job_data_object(ft->jobid); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED); } } }
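/*
 * Added commentary: file_sample() accumulates a tick for each sample in
 * which the watched attributes show no change (an unchanged size
 * short-circuits the access/mod checks); any detected change resets the
 * count. When tick reaches the configured limit, the "file-stalled" help
 * message fires and the job is pushed to
 * ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED.
 */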
static int setup_child(orte_job_t *jdata,
                       orte_proc_t *child,
                       orte_app_context_t *app)
{
    char *param, *value;
    int rc, i;
    int32_t nrestarts=0, *nrptr;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: setup_child",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != jdata->personality[i]; i++) {
            if (0 == strcmp(jdata->personality[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
    free(value);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required */
    orte_util_convert_process_name_to_string(&value, &child->name);
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action */
        asprintf(&value, "%d", nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
        free(value);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
        || 0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* construct the proc's session dir name */
    if (NULL != orte_process_info.tmpdir_base) {
        value = strdup(orte_process_info.tmpdir_base);
    } else {
        value = NULL;
    }
    param = NULL;
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
                                                        orte_process_info.nodename,
                                                        NULL, &child->name))) {
        ORTE_ERROR_LOG(rc);
        if (NULL != value) {
            free(value);
        }
        return rc;
    }
    free(value);

    /* pass an envar so the proc can find any files it had prepositioned */
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done */
            free(param);
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            free(param);
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't
         * adjust the $PWD enviro variable when it changes the directory. This
         * can cause a user to get a different response when doing getcwd vs
         * looking at the enviro variable. To keep this consistent, we explicitly
         * ensure that the PWD enviro variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching. */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    free(param);
    return ORTE_SUCCESS;
}
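/*
 * Illustration only: a standalone demonstration (not ORTE code) of why
 * setup_child() re-syncs $PWD after chdir() -- the kernel changes the
 * working directory, but the PWD environment variable is left stale
 * unless the caller updates it. The target directory is hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    const char *dir = "/tmp";               /* hypothetical session dir */
    if (0 != chdir(dir)) {
        perror("chdir");
        return 1;
    }
    /* chdir() does not touch $PWD, so getcwd() and getenv("PWD") can
     * now disagree; set PWD explicitly to keep them consistent */
    setenv("PWD", dir, 1);
    char cwd[4096];
    printf("cwd=%s PWD=%s\n",
           NULL == getcwd(cwd, sizeof(cwd)) ? "?" : cwd,
           getenv("PWD"));
    return 0;
}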
/*
 * A file descriptor is available/ready for send. Check the state
 * of the socket and take the appropriate action.
 */
void mca_oob_usock_send_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    mca_oob_usock_send_t* msg = peer->send_msg;
    int rc;

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s usock:send_handler called to send to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECTING:
    case MCA_OOB_USOCK_CLOSED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            mca_oob_usock_state_print(peer->state));
        mca_oob_usock_peer_complete_connect(peer);
        /* de-activate the send event until the connection
         * handshake completes */
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler SENDING TO %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
        if (NULL != msg) {
            /* if the header hasn't been completely sent, send it */
            if (!msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* header is completely sent */
                    msg->hdr_sent = true;
                    /* setup to send the data */
                    if (NULL == msg->msg) {
                        /* this was a zero-byte msg - nothing more to do */
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                        goto next;
                    } else if (NULL != msg->msg->buffer) {
                        /* send the buffer data as a single block */
                        msg->sdptr = msg->msg->buffer->base_ptr;
                        msg->sdbytes = msg->msg->buffer->bytes_used;
                    } else if (NULL != msg->msg->iov) {
                        /* start with the first iovec */
                        msg->sdptr = msg->msg->iov[0].iov_base;
                        msg->sdbytes = msg->msg->iov[0].iov_len;
                        msg->iovnum = 0;
                    } else {
                        msg->sdptr = msg->msg->data;
                        msg->sdbytes = msg->msg->count;
                    }
                    /* fall thru and let the send progress */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send header",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    goto next;
                }
            }
            /* progress the data transmission */
            if (msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* this block is complete */
                    if (NULL != msg->msg->buffer) {
                        /* we are done - notify the RML */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg->msg);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->data) {
                        /* this was a relay message - nothing more to do */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else {
                        /* rotate to the next iovec */
                        msg->iovnum++;
                        if (msg->iovnum < msg->msg->count) {
                            msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
                            msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
                            /* exit this event to give the event lib
                             * a chance to progress any other pending
                             * actions */
                            return;
                        } else {
                            /* this message is complete - notify the RML */
                            opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                                "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&(peer->name)),
                                                msg->hdr.nbytes, peer->sd);
                            msg->msg->status = ORTE_SUCCESS;
                            ORTE_RML_SEND_COMPLETE(msg->msg);
                            OBJ_RELEASE(msg);
                            peer->send_msg = NULL;
                        }
                    }
                    /* fall thru to queue the next message */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send message ON SOCKET %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)), peer->sd);
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    ORTE_FORCED_TERMINATE(1);
                    return;
                }
            }

        next:
            /* if current message completed - progress any pending sends by
             * moving the next in the queue into the "on-deck" position. Note
             * that this doesn't mean we send the message right now - we will
             * wait for another send_event to fire before doing so. This gives
             * us a chance to service any pending recvs. */
            peer->send_msg = (mca_oob_usock_send_t*)
                opal_list_remove_first(&peer->send_queue);
        }
        /* if nothing else to do unregister for send event notifications */
        if (NULL == peer->send_msg && peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: invalid connection state (%d) on socket %d",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state, peer->sd);
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    }
}
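/*
 * Illustration only: the non-blocking partial-send convention that
 * send_bytes() (not shown here) follows, as a standalone sketch. The
 * tri-state return mirrors the handler above: 0 = block fully sent,
 * 1 = would block (re-arm the send event), -1 = hard error. All names
 * are hypothetical; the demo uses a socketpair for self-containment.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

static int send_some(int sd, const char **ptr, size_t *nbytes)
{
    while (*nbytes > 0) {
        ssize_t rc = send(sd, *ptr, *nbytes, 0);
        if (rc < 0) {
            if (EINTR == errno) {
                continue;                      /* interrupted: just retry */
            }
            if (EAGAIN == errno || EWOULDBLOCK == errno) {
                return 1;                      /* busy: let the event lib cycle */
            }
            return -1;                         /* hard error: abort the message */
        }
        *ptr += rc;                            /* partial send: advance cursor */
        *nbytes -= (size_t)rc;
    }
    return 0;                                  /* this block is complete */
}

int main(void)
{
    int sv[2];
    if (0 != socketpair(AF_UNIX, SOCK_STREAM, 0, sv)) return 1;
    const char *msg = "hello";
    size_t left = strlen(msg);
    int rc = send_some(sv[0], &msg, &left);
    printf("rc=%d remaining=%zu\n", rc, left);
    close(sv[0]);
    close(sv[1]);
    return 0;
}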
static void sstore_central_global_recv(int status, orte_process_name_t* sender,
                                       opal_buffer_t* buffer, orte_rml_tag_t tag,
                                       void* cbdata)
{
    int ret;
    orte_sstore_central_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_sstore_base_handle_t loc_id;
    orte_sstore_central_global_snapshot_info_t *handle_info = NULL;

    if( ORTE_RML_TAG_SSTORE_INTERNAL != tag ) {
        return;
    }

    /*
     * If this was an application process contacting us, then act like an orted
     * instead of an HNP
     */
    if(OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID,
                                                   ORTE_PROC_MY_NAME,
                                                   sender)) {
        orte_sstore_central_local_recv(status, sender, buffer, tag, cbdata);
        return;
    }

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(global): process_cmd(%s)",
                         ORTE_NAME_PRINT(sender)));

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &loc_id, &count, ORTE_SSTORE_HANDLE))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    /*
     * Find the referenced handle. If it is missing we cannot safely
     * process the command, so log the problem and bail out rather
     * than passing a NULL handle along (JJH flagged this as a "big
     * problem" - previously it fell through silently)
     */
    if(NULL == (handle_info = find_handle_info(loc_id)) ) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    /*
     * Process the command
     */
    if( ORTE_SSTORE_CENTRAL_PULL == command ) {
        process_local_pull(sender, buffer, handle_info);
    }
    else if( ORTE_SSTORE_CENTRAL_PUSH == command ) {
        process_local_push(sender, buffer, handle_info);
    }

 cleanup:
    return;
}
static int read_bytes(mca_oob_usock_peer_t* peer)
{
    int rc;

    /* read until all bytes recvd or error */
    while (0 < peer->recv_msg->rdbytes) {
        rc = read(peer->sd, peer->recv_msg->rdptr, peer->recv_msg->rdbytes);
        if (rc < 0) {
            if(opal_socket_errno == EINTR) {
                continue;
            } else if (opal_socket_errno == EAGAIN) {
                /* tell the caller to keep this message on active,
                 * but let the event lib cycle so other messages
                 * can progress while this socket is busy */
                return ORTE_ERR_RESOURCE_BUSY;
            } else if (opal_socket_errno == EWOULDBLOCK) {
                /* tell the caller to keep this message on active,
                 * but let the event lib cycle so other messages
                 * can progress while this socket is busy */
                return ORTE_ERR_WOULD_BLOCK;
            }
            /* we hit an error and cannot progress this message - report
             * the error back to the RML and let the caller know
             * to abort this message */
            opal_output_verbose(OOB_USOCK_DEBUG_FAIL, orte_oob_base_framework.framework_output,
                                "%s-%s mca_oob_usock_msg_recv: readv failed: %s (%d)",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)),
                                strerror(opal_socket_errno),
                                opal_socket_errno);
            // mca_oob_usock_peer_close(peer);
            // if (NULL != mca_oob_usock.oob_exception_callback) {
            //     mca_oob_usock.oob_exception_callback(&peer->name, ORTE_RML_PEER_DISCONNECTED);
            // }
            return ORTE_ERR_COMM_FAILURE;
        } else if (rc == 0) {
            /* the remote peer closed the connection - report that condition
             * and let the caller know */
            opal_output_verbose(OOB_USOCK_DEBUG_FAIL, orte_oob_base_framework.framework_output,
                                "%s-%s mca_oob_usock_msg_recv: peer closed connection",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
            /* stop all events */
            if (peer->recv_ev_active) {
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            if (peer->send_ev_active) {
                opal_event_del(&peer->send_event);
                peer->send_ev_active = false;
            }
            if (NULL != peer->recv_msg) {
                OBJ_RELEASE(peer->recv_msg);
                peer->recv_msg = NULL;
            }
            mca_oob_usock_peer_close(peer);
            // if (NULL != mca_oob_usock.oob_exception_callback) {
            //     mca_oob_usock.oob_exception_callback(&peer->peer_name, ORTE_RML_PEER_DISCONNECTED);
            // }
            return ORTE_ERR_WOULD_BLOCK;
        }
        /* we were able to read something, so adjust counters and location */
        peer->recv_msg->rdbytes -= rc;
        peer->recv_msg->rdptr += rc;
    }

    /* we read the full data block */
    return ORTE_SUCCESS;
}
/*************** MODEX SECTION **************/
void orte_grpcomm_base_modex(int fd, short args, void *cbdata)
{
    orte_grpcomm_caddy_t *caddy = (orte_grpcomm_caddy_t*)cbdata;
    orte_grpcomm_collective_t *modex = caddy->op;
    int rc;
    orte_namelist_t *nm;
    opal_list_item_t *item;
    bool found;
    orte_grpcomm_collective_t *cptr;
    opal_scope_t scope;

    OBJ_RELEASE(caddy);

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are a singleton and routing isn't enabled,
     * then we have nobody with which to communicate, so
     * we can just declare success */
    if ((orte_process_info.proc_type & ORTE_PROC_SINGLETON) &&
        !orte_routing_is_enabled) {
        if (NULL != modex->cbfunc) {
            OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                                 "%s CALLING MODEX RELEASE",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            modex->cbfunc(NULL, modex->cbdata);
        }
        /* flag the collective as complete */
        modex->active = false;
        return;
    }

    if (0 == opal_list_get_size(&modex->participants)) {
        /* record the collective */
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* put our process name in the buffer so it can be unpacked later */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* this is between our peers, so only collect info marked for them */
        scope = OPAL_SCOPE_PEER;

        /* add a wildcard name to the participants so the daemon knows
         * the jobid that is involved in this collective */
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = ORTE_PROC_MY_NAME->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&modex->participants, &nm->super);
        modex->next_cb = orte_grpcomm_base_store_modex;
    } else {
        /* see if the collective is already present - a race condition
         * exists where other participants may have already sent us their
         * contribution. This would place the collective on the global
         * array, but leave it marked as "inactive" until we call
         * modex with the list of participants */
        found = false;
        for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
             item != opal_list_get_end(&orte_grpcomm_base.active_colls);
             item = opal_list_get_next(item)) {
            cptr = (orte_grpcomm_collective_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s CHECKING COLL id %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 cptr->id));
            if (modex->id == cptr->id) {
                found = true;
                /* remove the old entry - we will replace it
                 * with the modex one */
                opal_list_remove_item(&orte_grpcomm_base.active_colls, item);
                break;
            }
        }
        if (found) {
            /* since it already exists, the list of
             * targets contains the list of procs
             * that have already sent us their info. Cycle
             * thru the targets and move those entries to
             * the modex object */
            while (NULL != (item = opal_list_remove_first(&cptr->targets))) {
                opal_list_append(&modex->targets, item);
            }
            /* copy the previously-saved data across */
            opal_dss.copy_payload(&modex->local_bucket, &cptr->local_bucket);
            /* cleanup */
            OBJ_RELEASE(cptr);
        }
        /* now add the modex to the global list of active collectives */
        modex->next_cb = orte_grpcomm_base_store_modex;
        modex->next_cbdata = modex;
        opal_list_append(&orte_grpcomm_base.active_colls, &modex->super);

        /* pack the collective id */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, &modex->id, 1, ORTE_GRPCOMM_COLL_ID_T))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* pack our name */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(&modex->buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }

        /* this is not amongst our peers, but rather between a select
         * group of processes - e.g., during a connect/accept operation.
         * Thus, we need to include the non-peer info as well as our peers
         * since we can't tell what the other participants may already have */
        scope = OPAL_SCOPE_GLOBAL;
    }

    /* pack the requested entries */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&modex->buffer, scope))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* execute the allgather */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather(modex))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:modex: modex posted",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    return;

 cleanup:
    return;
}
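/*
 * Illustration only: conceptually, the modex is an allgather of
 * (name, payload) contributions -- every participant submits one blob
 * and receives all N of them. This standalone toy shows the view each
 * rank holds after the allgather; the types and data are hypothetical.
 */
#include <stdio.h>

#define NPROCS 3

typedef struct {
    int vpid;               /* contributor's rank */
    const char *payload;    /* e.g., its contact URI */
} modex_entry_t;

int main(void)
{
    /* one contribution per rank */
    modex_entry_t contrib[NPROCS] = {
        { 0, "uri-of-rank-0" },
        { 1, "uri-of-rank-1" },
        { 2, "uri-of-rank-2" }
    };
    /* after the allgather, every rank sees every entry */
    for (int me = 0; me < NPROCS; me++) {
        printf("rank %d sees:", me);
        for (int i = 0; i < NPROCS; i++) {
            printf(" [%d:%s]", contrib[i].vpid, contrib[i].payload);
        }
        printf("\n");
    }
    return 0;
}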
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    int rc;
    orte_rml_send_t *snd;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:usock:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_USOCK_CONNECTED;
        } else {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:usock:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name),
                                        peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                    peer->recv_msg->rdptr = NULL;
                    peer->recv_msg->rdbytes = 0;
                } else {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:usock:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:usock:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_usock_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* we recvd all of the message */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s DELIVERING TO RML",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin,
                                          peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                    /* clear the slot so the next recv event starts fresh */
                    peer->recv_msg = NULL;
                } else {
                    /* no - we don't route things, so we promote this
                     * back to the OOB and let another transport move
                     * it along. If we are a daemon and it is intended
                     * for another of our local procs, it will just come
                     * back to us and be handled then */
                    snd = OBJ_NEW(orte_rml_send_t);
                    snd->dst = peer->recv_msg->hdr.dst;
                    snd->origin = peer->recv_msg->hdr.origin;
                    snd->tag = peer->recv_msg->hdr.tag;
                    snd->data = peer->recv_msg->data;
                    snd->seq_num = peer->recv_msg->hdr.seq_num;
                    snd->count = peer->recv_msg->hdr.nbytes;
                    snd->cbfunc.iov = NULL;
                    snd->cbdata = NULL;
                    /* activate the OOB send state */
                    ORTE_OOB_SEND(snd);
                    /* protect the data */
                    peer->recv_msg->data = NULL;
                    /* cleanup */
                    OBJ_RELEASE(peer->recv_msg);
                    /* clear the slot so the next recv event starts fresh */
                    peer->recv_msg = NULL;
                    return;
                }
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_usock_peer_close(peer);
        break;
    }
}
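/*
 * Illustration only: the header-then-payload framing that the recv
 * handler above implements, reduced to a blocking standalone sketch.
 * A fixed-size header is read first; hdr.nbytes then sizes the payload
 * allocation (zero-byte messages carry no payload). The header layout
 * and names are hypothetical, and real code must also handle
 * endianness and partial reads on non-blocking sockets.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

typedef struct {
    uint32_t tag;
    uint32_t nbytes;
} msg_hdr_t;

/* read exactly n bytes; returns 0 on success, -1 on EOF or error */
static int read_exact(int sd, void *buf, size_t n)
{
    char *p = buf;
    while (n > 0) {
        ssize_t rc = read(sd, p, n);
        if (rc <= 0) return -1;
        p += rc;
        n -= (size_t)rc;
    }
    return 0;
}

/* returns malloc'd payload, or NULL for errors and zero-byte messages */
static char *recv_frame(int sd, msg_hdr_t *hdr)
{
    if (0 != read_exact(sd, hdr, sizeof(*hdr))) return NULL;
    if (0 == hdr->nbytes) return NULL;
    char *data = malloc(hdr->nbytes);
    if (NULL == data || 0 != read_exact(sd, data, hdr->nbytes)) {
        free(data);
        return NULL;
    }
    return data;
}

int main(void)
{
    int sv[2];
    if (0 != socketpair(AF_UNIX, SOCK_STREAM, 0, sv)) return 1;
    msg_hdr_t hdr = { .tag = 42, .nbytes = 5 };
    (void)write(sv[0], &hdr, sizeof(hdr));
    (void)write(sv[0], "hello", 5);
    msg_hdr_t in;
    char *payload = recv_frame(sv[1], &in);
    if (NULL != payload) {
        printf("tag=%u nbytes=%u payload=%.*s\n",
               in.tag, in.nbytes, (int)in.nbytes, payload);
        free(payload);
    }
    close(sv[0]);
    close(sv[1]);
    return 0;
}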
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config, &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    /* setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here. */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier */
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the
     * orcm routed component will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initialize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't, some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;

 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ORTE_ERR_SILENT;
}
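/*
 * Illustration only: the open/select + "goto error" cleanup idiom that
 * orcmd_init() repeats for every framework, reduced to a standalone
 * sketch. Each step records which stage failed so a single exit path
 * can report it; open_a/open_b are hypothetical stand-ins.
 */
#include <stdio.h>

static int open_a(void) { return 0; }
static int open_b(void) { return -1; }   /* pretend this stage fails */

static int init_all(void)
{
    int ret;
    const char *error = NULL;

    if (0 != (ret = open_a())) {
        error = "open_a";
        goto error;
    }
    if (0 != (ret = open_b())) {
        error = "open_b";
        goto error;
    }
    return 0;

 error:
    fprintf(stderr, "init failed in %s (ret=%d)\n", error, ret);
    return ret;
}

int main(void)
{
    return init_all() ? 1 : 0;
}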
/**
 * Discover the available resources. Obtain directly from LoadLeveler (and
 * therefore have no need to validate) -- ignore hostfile or any other
 * user-specified parameters.
 */
static int orte_ras_loadleveler_discover(opal_list_t* nodelist)
{
    orte_node_t *node;
    opal_list_item_t* item;
    FILE *fp;
    char *hostname;
    char *filename;
    char input[LL_FILE_MAX_LINE_LENGTH];

    /* Ignore anything that the user already specified -- we're
       getting nodes only from LoadLeveler. */
    filename = getenv("LOADL_HOSTFILE");
    if (NULL == filename) {
        opal_output(orte_ras_base_framework.framework_output,
                    "ras:loadleveler:allocate:discover: LOADL_HOSTFILE not set. "
                    "Unable to discover allocated nodes.");
        return ORTE_ERROR;
    }
    fp = fopen(filename, "r");
    if (NULL == fp) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* Iterate through all the nodes and make an entry for each */
    while (0 != ll_getline(fp, input)) {
        hostname = strdup(input);
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:loadleveler:allocate:discover: got hostname %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));

        /* Remember that LoadLeveler may list the same node more than
           once. So we have to check for duplicates. */
        for (item = opal_list_get_first(nodelist);
             opal_list_get_end(nodelist) != item;
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, hostname)) {
                ++node->slots;
                OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                     "%s ras:loadleveler:allocate:discover: found -- bumped slots to %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
                break;
            }
        }

        /* Did we find it? */
        if (opal_list_get_end(nodelist) == item) {
            /* Nope -- didn't find it, so add a new item to the list */
            OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                 "%s ras:loadleveler:allocate:discover: not found -- added to list",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            node = OBJ_NEW(orte_node_t);
            node->name = hostname;
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            opal_list_append(nodelist, &node->super);
        } else {
            /* Yes, so we need to free the hostname that came back */
            free(hostname);
        }
    }
    fclose(fp);

    return ORTE_SUCCESS;
}
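/*
 * Illustration only: LoadLeveler-style hostfiles list a host once per
 * slot, so duplicates bump the slot count rather than adding a node.
 * A standalone sketch of that accounting with hypothetical data:
 */
#include <stdio.h>
#include <string.h>

typedef struct {
    char name[64];
    int slots;
} node_t;

int main(void)
{
    const char *hostfile[] = { "n0", "n1", "n0", "n0", "n1" };
    node_t nodes[8];
    int nnodes = 0;

    for (int i = 0; i < 5; i++) {
        int j;
        for (j = 0; j < nnodes; j++) {
            if (0 == strcmp(nodes[j].name, hostfile[i])) {
                ++nodes[j].slots;          /* duplicate: one more slot */
                break;
            }
        }
        if (j == nnodes) {                 /* not found: new node, one slot */
            snprintf(nodes[nnodes].name, sizeof(nodes[nnodes].name), "%s", hostfile[i]);
            nodes[nnodes].slots = 1;
            nnodes++;
        }
    }
    for (int j = 0; j < nnodes; j++) {
        printf("%s slots=%d\n", nodes[j].name, nodes[j].slots);
    }
    return 0;    /* prints n0 slots=3, n1 slots=2 */
}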
/*
 * Initialization of data structures for running under a debugger
 * using the MPICH/TotalView parallel debugger interface. This stage
 * of initialization must occur after spawn.
 *
 * NOTE: We -always- perform this step to ensure that any debugger
 * that attaches to us post-launch of the application can get a
 * completed proctable
 */
void orte_debugger_base_init_after_spawn(orte_job_t *jdata)
{
    orte_proc_t *proc;
    orte_app_context_t *appctx;
    orte_vpid_t i, j;
    opal_buffer_t buf;
    orte_process_name_t rank0;
    int rc;

    if (MPIR_proctable) {
        /* already initialized */
        opal_output_verbose(5, orte_debugger_base.output,
                            "%s: debugger already initialized",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return;
    }

    /* fill in the proc table for the application processes */
    opal_output_verbose(5, orte_debugger_base.output,
                        "%s: Setting up debugger process table for applications",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    MPIR_debug_state = 1;

    /* set the total number of processes in the job */
    MPIR_proctable_size = jdata->num_procs;

    /* allocate MPIR_proctable */
    MPIR_proctable = (struct MPIR_PROCDESC *)malloc(sizeof(struct MPIR_PROCDESC) *
                                                    MPIR_proctable_size);
    if (MPIR_proctable == NULL) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return;
    }

    if (orte_debugger_base.dump_proctable) {
        opal_output(orte_clean_output, "MPIR Proctable for job %s",
                    ORTE_JOBID_PRINT(jdata->jobid));
    }

    /* initialize MPIR_proctable */
    for (j=0; j < jdata->num_procs; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        /* store this data in the location whose index
         * corresponds to the proc's rank */
        i = proc->name.vpid;
        if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) {
            continue;
        }
        MPIR_proctable[i].host_name = strdup(proc->node->name);
        if (0 == strncmp(appctx->app, OPAL_PATH_SEP, 1)) {
            MPIR_proctable[i].executable_name =
                opal_os_path(false, appctx->app, NULL);
        } else {
            MPIR_proctable[i].executable_name =
                opal_os_path(false, appctx->cwd, appctx->app, NULL);
        }
        MPIR_proctable[i].pid = proc->pid;
        if (orte_debugger_base.dump_proctable) {
            opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d",
                        ORTE_VPID_PRINT(i),
                        MPIR_proctable[i].host_name,
                        MPIR_proctable[i].executable_name,
                        MPIR_proctable[i].pid);
        }
    }

    if (0 < opal_output_get_verbosity(orte_debugger_base.output)) {
        orte_debugger_base_dump();
    }

    /* if we are being launched under a debugger, then we must wait
     * for it to be ready to go and do some things to start the job */
    if (MPIR_being_debugged) {
        /* wait for all procs to have reported their contact info - this
         * ensures that (a) they are all into mpi_init, and (b) the system
         * has the contact info to successfully send a message to rank=0 */
        ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs);
        (void) MPIR_Breakpoint();
        /* send a message to rank=0 to release it */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);  /* don't need anything in this */
        rank0.jobid = jdata->jobid;
        rank0.vpid = 0;
        if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) {
            opal_output(0, "Error: could not send debugger release to MPI procs - error %s",
                        ORTE_ERROR_NAME(rc));
        }
        OBJ_DESTRUCT(&buf);
    }
}
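/*
 * Illustration only: the MPIR process-acquisition interface consumed by
 * TotalView-style debuggers is a set of well-known global symbols that
 * the debugger reads from the starter process after attaching. This
 * standalone mock shows the shape of the data the function above fills
 * in; the host, executable, and pid values are fabricated for the demo.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct MPIR_PROCDESC {
    char *host_name;
    char *executable_name;
    int pid;
};

struct MPIR_PROCDESC *MPIR_proctable = NULL;
int MPIR_proctable_size = 0;
volatile int MPIR_debug_state = 0;

int main(void)
{
    MPIR_proctable_size = 2;
    MPIR_proctable = calloc(MPIR_proctable_size, sizeof(*MPIR_proctable));
    if (NULL == MPIR_proctable) return 1;
    for (int i = 0; i < MPIR_proctable_size; i++) {
        MPIR_proctable[i].host_name = strdup("nodeA");
        MPIR_proctable[i].executable_name = strdup("/home/user/app");
        MPIR_proctable[i].pid = 1000 + i;
        printf("rank %d: host %s exe %s pid %d\n", i,
               MPIR_proctable[i].host_name,
               MPIR_proctable[i].executable_name,
               MPIR_proctable[i].pid);
    }
    MPIR_debug_state = 1;   /* MPIR_DEBUG_SPAWNED */
    return 0;
}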
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;
    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;
    orte_vpid_t lowest=0;

    opal_output_verbose(2, orte_state_base_output,
                        "%s state:base:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                             "%s state:base:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_DAEMONS;
    } else {
        /* mark the job as terminated, but don't override any
         * abnormal termination flags */
        if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
            jdata->state = ORTE_JOB_STATE_TERMINATED;
        }
    }

    /* turn off any sensor monitors on this job */
    orte_sensor.stop(jdata->jobid);

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    if (0 < jdata->num_non_zero_exit && !orte_abort_non_zero_exit) {
        if (!orte_report_child_jobs_separately ||
            1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* update the exit code */
            ORTE_UPDATE_EXIT_STATUS(lowest);
        }
        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "While %s job %s terminated normally, %d %s. Further examination may be required.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    jdata->num_non_zero_exit,
                    (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." :
                                                      "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                         "%s state:base:check_job_completed declared job %s normally terminated - checking all jobs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here */
    if (NULL != jdata &&
        (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls ||
         ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) {
        goto CHECK_ALIVE;
    }

    /* if the job that is being checked is the HNP, then we are
     * trying to terminate the orteds. In that situation, we
     * do -not- check all jobs - we simply notify the HNP
     * that the orteds are complete. Also check special case
     * if jdata is NULL - we want
     * to definitely declare the job done if the orteds
     * have completed, no matter what else may be happening.
     * This can happen if a ctrl-c hits in the "wrong" place
     * while launching */
 CHECK_DAEMONS:
    if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        if (0 == orte_routed.num_routes()) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job */
    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                 "%s releasing procs from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                node->slots_inuse--;
                node->num_procs--;
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                     "%s releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* flag that the node is no longer in a map - do this
             * BEFORE releasing the map's reference, in case that
             * release is the last one on the node object */
            node->mapped = false;
            /* maintain accounting */
            OBJ_RELEASE(node);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

 CHECK_ALIVE:
    /* now check to see if all jobs are done - trigger notification of this jdata
     * object when we find it */
    one_still_alive = false;
    for (j=1; j < orte_job_data->size; j++) {
        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
            /* since we are releasing jdata objects as we
             * go, we can no longer assume that the job_data
             * array is left justified */
            continue;
        }
        /* if this is the job we are checking AND it normally terminated,
         * then activate the "notify_completed" state - this will release
         * the job state, but is provided so that the HNP main code can
         * take alternative actions if desired. If the state is killed_by_cmd,
         * then go ahead and release it. We cannot release it if it
         * abnormally terminated as mpirun needs the info so it can
         * report appropriately to the user
         *
         * NOTE: do not release the primary job (j=1) so we
         * can pretty-print completion message */
        if (NULL != jdata && job->jobid == jdata->jobid) {
            if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                     "%s state:base:check_job_completed state is terminated - activating notify",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
                one_still_alive = true;
            } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
                       jdata->state == ORTE_JOB_STATE_NOTIFIED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                     "%s state:base:check_job_completed state is killed or notified - cleaning up",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained! */
                if (1 < j) {
                    opal_pointer_array_set_item(orte_job_data, j, NULL);  /* ensure the array has a NULL */
                    OBJ_RELEASE(jdata);
                }
            }
            continue;
        }
        /* if the job is flagged to not be monitored, skip it */
        if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) {
            continue;
        }
        /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated! */
        if (job->num_terminated < job->num_procs) {
            /* we have at least one job that is not done yet - we cannot
             * just return, though, as we need to ensure we cleanout the
             * job data for the job that just completed */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                 "%s state:base:check_job_completed job %s is not terminated (%d:%d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs));
            one_still_alive = true;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                                 "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs,
                                 (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state)));
        }
    }

    /* if a job is still alive, we just return */
    if (one_still_alive) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                             "%s state:base:check_job_completed at least one job is not terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }

    /* if we get here, then all jobs are done, so terminate */
    OPAL_OUTPUT_VERBOSE((2, orte_state_base_output,
                         "%s state:base:check_job_completed all jobs terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    /* set the exit status to 0 - this will only happen if it
     * wasn't already set by an error condition */
    ORTE_UPDATE_EXIT_STATUS(0);
    /* order daemon termination - this tells us to cleanup
     * our local procs as well as telling remote daemons
     * to die */
    orte_plm.terminate_orteds();
    OBJ_RELEASE(caddy);
}
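/*
 * Illustration only: the "one_still_alive" scan above walks a sparse
 * pointer array in which released jobs leave NULL holes, so the loop
 * must skip rather than stop at them. A standalone sketch with
 * hypothetical types:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    int num_terminated;
    int num_procs;
} job_t;

static bool any_still_alive(job_t **jobs, size_t n)
{
    for (size_t j = 0; j < n; j++) {
        if (NULL == jobs[j]) {
            continue;       /* slot already released: array not left-justified */
        }
        if (jobs[j]->num_terminated < jobs[j]->num_procs) {
            return true;
        }
    }
    return false;
}

int main(void)
{
    job_t a = { 4, 4 };     /* done */
    job_t b = { 1, 2 };     /* still running */
    job_t *jobs[] = { &a, NULL, &b };
    printf("alive=%d\n", (int)any_still_alive(jobs, 3));
    return 0;
}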