void orte_state_base_check_all_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = caddy->jdata; orte_proc_t *proc; int i; orte_std_cntr_t j; orte_job_t *job; orte_node_t *node; orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; orte_vpid_t lowest=0; int32_t i32, *i32ptr; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:base:check_job_complete on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)); if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* just check to see if the daemons are complete */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_complete - received NULL job, checking daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto CHECK_DAEMONS; } else { /* mark the job as terminated, but don't override any * abnormal termination flags */ if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) { jdata->state = ORTE_JOB_STATE_TERMINATED; } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } i32ptr = &i32; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) && !orte_abort_non_zero_exit) { if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { /* update the exit code */ ORTE_UPDATE_EXIT_STATUS(lowest); } /* warn user */ opal_output(orte_clean_output, "-------------------------------------------------------\n" "While %s job %s terminated normally, %d %s. Further examination may be required.\n" "-------------------------------------------------------", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), i32, (1 == i32) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jdata->state))); /* if this job is a continuously operating one, then don't do * anything further - just return here */ if (NULL != jdata && (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) || ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) { goto CHECK_ALIVE; } /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP * that the orteds are complete. Also check special case * if jdata is NULL - we want * to definitely declare the job done if the orteds * have completed, no matter what else may be happening. * This can happen if a ctrl-c hits in the "wrong" place * while launching */ CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (NULL == jdata) { jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); OBJ_RELEASE(caddy); return; } OBJ_RELEASE(caddy); return; } /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". * Note that an aborted/killed job -is- flagged as complete and will * therefore have its resources released. 
We need to do this after * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (proc->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } CHECK_ALIVE: /* now check to see if all jobs are done - trigger notification of this jdata * object when we find it */ one_still_alive = false; for (j=1; j < orte_job_data->size; j++) { if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { /* since we are releasing jdata objects as we * go, we can no longer assume that the job_data * array is left justified */ continue; } /* if this is the job we are checking AND it normally terminated, * then activate the "notify_completed" state - this will release * the job state, but is provided so that 
the HNP main code can * take alternative actions if desired. If the state is killed_by_cmd, * then go ahead and release it. We cannot release it if it * abnormally terminated as mpirun needs the info so it can * report appropriately to the user * * NOTE: do not release the primary job (j=1) so we * can pretty-print completion message */ if (NULL != jdata && job->jobid == jdata->jobid) { if (jdata->state == ORTE_JOB_STATE_TERMINATED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed state is terminated - activating notify", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED); one_still_alive = true; } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD || jdata->state == ORTE_JOB_STATE_NOTIFIED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed state is killed or notified - cleaning up", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* release this object, ensuring that the * pointer array internal accounting * is maintained! */ if (1 < j) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { /* this was a debugger daemon. notify that a debugger has detached */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); } opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ OBJ_RELEASE(jdata); } } continue; } /* if the job is flagged to not be monitored, skip it */ if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) { continue; } /* when checking for job termination, we must be sure to NOT check * our own job as it - rather obviously - has NOT terminated! 
*/ if (job->num_terminated < job->num_procs) { /* we have at least one job that is not done yet - we cannot * just return, though, as we need to ensure we cleanout the * job data for the job that just completed */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed job %s is not terminated (%d:%d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs)); one_still_alive = true; } else { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid), job->num_terminated, job->num_procs, (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) )); } } /* if a job is still alive, we just return */ if (one_still_alive) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed at least one job is not terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_RELEASE(caddy); return; } /* if we get here, then all jobs are done, so terminate */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:base:check_job_completed all jobs terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* stop the job timeout event, if set */ if (NULL != orte_mpiexec_timeout) { OBJ_RELEASE(orte_mpiexec_timeout); orte_mpiexec_timeout = NULL; } /* set the exit status to 0 - this will only happen if it * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); /* order daemon termination - this tells us to cleanup * our local procs as well as telling remote daemons * to die */ orte_plm.terminate_orteds(); OBJ_RELEASE(caddy); }
/*
 * Parent side of the fork: block on the error-reporting pipe from the child
 * until it either closes (successful launch) or delivers a fatal error
 * message. Warning messages are printed and the loop continues.
 *
 * read_fd is always closed before returning. On success, the child (if any)
 * is marked RUNNING/ALIVE; on a fatal message it is marked FAILED_TO_START.
 * Returns ORTE_SUCCESS, ORTE_ERR_FAILED_TO_START, or the error from a
 * failed pipe read / allocation.
 */
static int do_parent(orte_app_context_t* context,
                     orte_proc_t *child,
                     char **environ_copy,
                     orte_job_t *jobdat,
                     int read_fd,
                     orte_iof_base_io_conf_t opts)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* connect endpoints IOF */
        rc = orte_iof_base_setup_parent(&child->name, &opts);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);
            child->state = ORTE_PROC_STATE_UNDEF;
            return rc;
        }
    }

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);
            if (NULL != child) {
                child->state = ORTE_PROC_STATE_UNDEF;
            }
            return rc;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Read in the strings; ensure to terminate them with \0 */
        if (msg.file_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                /* close the pipe before bailing out (the original code
                 * leaked read_fd on this path) */
                close(read_fd);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                /* close the pipe before bailing out */
                close(read_fd);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                close(read_fd);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                /* the original returned the stale rc here, which was
                 * OPAL_SUCCESS - report the allocation failure instead */
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
            if (OPAL_SUCCESS != rc) {
                /* the original never checked this read */
                orte_show_help("help-orte-odls-default.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                free(str);
                close(read_fd);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
        }

        /* Print out what we got. We already have a rendered string,
           so use orte_show_help_norender(). */
        if (msg.msg_str_len > 0) {
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched successfully). */
        if (msg.fatal) {
            if (NULL != child) {
                child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != child) {
        child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);

    return ORTE_SUCCESS;
}
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; int i; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ if (orte_orteds_term_ordered && 0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:base all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
/* * Query the registry for all nodes allocated to a specified app_context */ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr_t *total_num_slots, orte_app_context_t *app, orte_mapping_policy_t policy, bool initial_map, bool silent) { opal_list_item_t *item, *next; orte_node_t *node, *nd, *nptr; orte_std_cntr_t num_slots; orte_std_cntr_t i; int rc; orte_job_t *daemons; bool novm; opal_list_t nodes; char *hosts; /** set default answer */ *total_num_slots = 0; /* get the daemon job object */ daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); /* see if we have a vm or not */ novm = orte_get_attribute(&daemons->attributes, ORTE_JOB_NO_VM, NULL, OPAL_BOOL); /* if this is NOT a managed allocation, then we use the nodes * that were specified for this app - there is no need to collect * all available nodes and "filter" them */ if (!orte_managed_allocation) { OBJ_CONSTRUCT(&nodes, opal_list_t); /* if the app provided a dash-host, and we are not treating * them as requested or "soft" locations, then use those nodes */ hosts = NULL; if (!orte_soft_locations && orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using dash_host %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); if (ORTE_SUCCESS != (rc = orte_util_add_dash_host_nodes(&nodes, hosts, false))) { ORTE_ERROR_LOG(rc); free(hosts); return rc; } free(hosts); } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) { /* otherwise, if the app provided a hostfile, then use that */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hosts)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, hosts))) { free(hosts); ORTE_ERROR_LOG(rc); return rc; } free(hosts); } else if (NULL != orte_rankfile) { /* use the rankfile, if provided */ 
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using rankfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_rankfile)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_rankfile))) { ORTE_ERROR_LOG(rc); return rc; } if (0 == opal_list_get_size(&nodes)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s nothing found in given rankfile", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); OBJ_DESTRUCT(&nodes); return ORTE_ERR_BAD_PARAM; } } else if (NULL != orte_default_hostfile) { /* fall back to the default hostfile, if provided */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using default hostfile %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_default_hostfile)); if (ORTE_SUCCESS != (rc = orte_util_add_hostfile_nodes(&nodes, orte_default_hostfile))) { ORTE_ERROR_LOG(rc); return rc; } /* this is a special case - we always install a default * hostfile, but it is empty. If the user didn't remove it * or put something into it, then we will have pursued that * option and found nothing. 
This isn't an error, we just need * to add all the known nodes */ if (0 == opal_list_get_size(&nodes)) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s nothing in default hostfile - using known nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto addknown; } } else { /* if nothing else was available, then use all known nodes, which * will include ourselves */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s using known nodes", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto addknown; } /** if we still don't have anything */ if (0 == opal_list_get_size(&nodes)) { if (!silent) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); } OBJ_DESTRUCT(&nodes); return ORTE_ERR_SILENT; } /* find the nodes in our node array and assemble them * in daemon order if the vm was launched */ while (NULL != (item = opal_list_remove_first(&nodes))) { nptr = (orte_node_t*)item; nd = NULL; for (i=0; i < orte_node_pool->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { continue; } if (0 != strcmp(node->name, nptr->name)) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s DOESNT MATCH NODE %s", node->name, nptr->name)); continue; } /* ignore nodes that are marked as do-not-use for this mapping */ if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_USE", node->name)); /* reset the state so it can be used another time */ node->state = ORTE_NODE_STATE_UP; continue; } if (ORTE_NODE_STATE_DOWN == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS DOWN", node->name)); continue; } if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_INCLUDE", node->name)); /* not to be used */ continue; } /* if this node wasn't included in 
the vm (e.g., by -host), ignore it, * unless we are mapping prior to launching the vm */ if (NULL == node->daemon && !novm) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s HAS NO DAEMON", node->name)); continue; } /* retain a copy for our use in case the item gets * destructed along the way */ OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || nd->daemon->name.vpid < node->daemon->name.vpid) { /* just append to end */ opal_list_append(allocated_nodes, &node->super); nd = node; } else { /* starting from end, put this node in daemon-vpid order */ while (node->daemon->name.vpid < nd->daemon->name.vpid) { if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) { /* insert at beginning */ opal_list_prepend(allocated_nodes, &node->super); goto moveon1; } nd = (orte_node_t*)opal_list_get_prev(&nd->super); } item = opal_list_get_next(&nd->super); if (item == opal_list_get_end(allocated_nodes)) { /* we are at the end - just append */ opal_list_append(allocated_nodes, &node->super); } else { nd = (orte_node_t*)item; opal_list_insert_pos(allocated_nodes, item, &node->super); } moveon1: /* reset us back to the end for the next node */ nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } } OBJ_RELEASE(nptr); } OBJ_DESTRUCT(&nodes); /* now prune for usage and compute total slots */ goto complete; } addknown: /* if the hnp was allocated, include it unless flagged not to */ if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) { if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "HNP IS MARKED NO_USE")); /* clear this for 
future use, but don't include it */ node->state = ORTE_NODE_STATE_UP; } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) { OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } opal_list_append(allocated_nodes, &node->super); } } } /* add everything in the node pool that can be used - add them * in daemon order, which may be different than the order in the * node pool. Since an empty list is passed into us, the list at * this point either has the HNP node or nothing, and the HNP * node obviously has a daemon on it (us!) */ if (0 == opal_list_get_size(allocated_nodes)) { /* the list is empty */ nd = NULL; } else { nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } for (i=1; i < orte_node_pool->size; i++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) { /* ignore nodes that are marked as do-not-use for this mapping */ if (ORTE_NODE_STATE_DO_NOT_USE == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_USE", node->name)); /* reset the state so it can be used another time */ node->state = ORTE_NODE_STATE_UP; continue; } if (ORTE_NODE_STATE_DOWN == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED DOWN", node->name)); continue; } if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s IS MARKED NO_INCLUDE", node->name)); /* not to be used */ continue; } /* if this node wasn't included in the vm (e.g., by -host), ignore it, * unless we are mapping prior to launching the vm */ if (NULL == node->daemon && !novm) { OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base_framework.framework_output, "NODE %s HAS NO DAEMON", node->name)); continue; } /* retain a copy for our use in case the item gets * destructed 
along the way */ OBJ_RETAIN(node); if (initial_map) { /* if this is the first app_context we * are getting for an initial map of a job, * then mark all nodes as unmapped */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } if (NULL == nd || NULL == nd->daemon || NULL == node->daemon || nd->daemon->name.vpid < node->daemon->name.vpid) { /* just append to end */ opal_list_append(allocated_nodes, &node->super); nd = node; } else { /* starting from end, put this node in daemon-vpid order */ while (node->daemon->name.vpid < nd->daemon->name.vpid) { if (opal_list_get_begin(allocated_nodes) == opal_list_get_prev(&nd->super)) { /* insert at beginning */ opal_list_prepend(allocated_nodes, &node->super); goto moveon; } nd = (orte_node_t*)opal_list_get_prev(&nd->super); } item = opal_list_get_next(&nd->super); if (item == opal_list_get_end(allocated_nodes)) { /* we are at the end - just append */ opal_list_append(allocated_nodes, &node->super); } else { nd = (orte_node_t*)item; opal_list_insert_pos(allocated_nodes, item, &node->super); } moveon: /* reset us back to the end for the next node */ nd = (orte_node_t*)opal_list_get_last(allocated_nodes); } } } OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Starting with %d nodes in list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)opal_list_get_size(allocated_nodes))); /** check that anything is here */ if (0 == opal_list_get_size(allocated_nodes)) { if (!silent) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-available-resources", true); } return ORTE_ERR_SILENT; } /* filter the nodes thru any hostfile and dash-host options */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Filtering thru apps", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (ORTE_SUCCESS != (rc = orte_rmaps_base_filter_nodes(app, allocated_nodes, true)) && ORTE_ERR_TAKE_NEXT_OPTION != rc) { ORTE_ERROR_LOG(rc); return rc; } OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Retained %d 
nodes in list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)opal_list_get_size(allocated_nodes))); complete: /* remove all nodes that are already at max usage, and * compute the total number of allocated slots while * we do so */ num_slots = 0; item = opal_list_get_first(allocated_nodes); while (item != opal_list_get_end(allocated_nodes)) { /** save the next pointer in case we remove this node */ next = opal_list_get_next(item); /** check to see if this node is fully used - remove if so */ node = (orte_node_t*)item; if (0 != node->slots_max && node->slots_inuse > node->slots_max) { OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Removing node %s: max %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots_max, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } else if (node->slots <= node->slots_inuse && (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* remove the node as fully used */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s Removing node %s slots %d inuse %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots, node->slots_inuse)); opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } else if (node->slots > node->slots_inuse) { /* add the available slots */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s node %s has %d slots available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots - node->slots_inuse)); num_slots += node->slots - node->slots_inuse; } else if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) { /* nothing needed to do here - we don't add slots to the * count as we don't have any available. 
Just let the mapper * do what it needs to do to meet the request */ OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output, "%s node %s is fully used, but available for oversubscrition", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); } else { /* if we cannot use it, remove it from list */ opal_list_remove_item(allocated_nodes, item); OBJ_RELEASE(item); /* "un-retain" it */ } /** go on to next item */ item = next; } /* Sanity check to make sure we have resources available */ if (0 == opal_list_get_size(allocated_nodes)) { if (silent) { /* let the caller know that the resources exist, * but are currently busy */ return ORTE_ERR_RESOURCE_BUSY; } else { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:all-available-resources-used", true); return ORTE_ERR_SILENT; } } /* pass back the total number of available slots */ *total_num_slots = num_slots; if (4 < opal_output_get_verbosity(orte_rmaps_base_framework.framework_output)) { opal_output(0, "AVAILABLE NODES FOR MAPPING:"); for (item = opal_list_get_first(allocated_nodes); item != opal_list_get_end(allocated_nodes); item = opal_list_get_next(item)) { node = (orte_node_t*)item; opal_output(0, " node: %s daemon: %s", node->name, (NULL == node->daemon) ? "NULL" : ORTE_VPID_PRINT(node->daemon->name.vpid)); } } return ORTE_SUCCESS; }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr, *proct; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; int i; int32_t i32, *i32ptr; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_hnp: for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* could be a race condition */ goto cleanup; } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* we MUST handle a communication failure before doing anything else * as it requires some special care to avoid normal termination issues * for local application procs */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure to non-daemon proc - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* mark the daemon as gone */ ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE); /* if we have ordered orteds to terminate or abort * is in progress, record it */ if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: at least one proc (%s) still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proct->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr_hnp: all routes and children gone - ordering exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: %d routes remain 
alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemon %s - aborting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* output an error message so the user knows what happened */ orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* update our exit code */ ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* just in case the exit code hadn't been set, do it here - this * won't override any reported exit code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); } /* abort the system */ default_hnp_abort(jdata); goto cleanup; } /* update the proc state - can get multiple reports on a proc * depending on circumstances, so ensure we only do this once */ if (pptr->state < ORTE_PROC_STATE_TERMINATED) { pptr->state = state; } /* if we were ordered to terminate, mark this proc as dead and see if * any of our routes or local children remain alive - if not, then * terminate ourselves. 
*/ if (orte_orteds_term_ordered) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:hnp all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } } keep_going: /* ensure we record the failed proc properly so we can report * the error once we terminate */ switch (state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s killed by cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } /* don't abort the job as this isn't an abnormal termination */ break; case ORTE_PROC_STATE_ABORTED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case 
ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted by signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s terminated without sync", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* now treat a special case - if the proc exit'd without a required * sync, it may have done 
so with a zero exit code. We want to ensure * that the user realizes there was an error, so in this -one- case, * we overwrite the process' exit code with the default error code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_LAUNCH: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { if (ORTE_PROC_STATE_FAILED_TO_START) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; } else { jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; } /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* if this was a daemon, report it */ if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* output a message indicating we failed to launch a daemon */ orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); } break; case ORTE_PROC_STATE_CALLED_ABORT: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s called abort with exit code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_CALLED_ABORT; /* point to the first proc to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it 
doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_TERM_NON_ZERO: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s exited with non-zero status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s heartbeat failed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); 
/* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); break; case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: unable to send message to proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if this proc is one of my daemons, then we are truly * hosed - so just exit out */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); break; } if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; default: /* shouldn't get this, but terminate job if required */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s default error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } break; } /* if the waitpid fired, be sure to let the state machine know */ if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED); } cleanup: OBJ_RELEASE(caddy); }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; char *rtmod; orte_std_cntr_t index; orte_job_map_t *map; orte_node_t *node; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, notify the HNP */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, 
&pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process * Do this after we handle termination in case the IOF needs * to check to see if all procs from the job are actually terminated */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDALL); } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. 
This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes gone but proc %s still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent 
it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* cleanup the procs as these are gone */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } /* if this child is part of the job... */ if (pptr->name.jobid == jdata->jobid) { /* clear the entry in the local children */ opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(pptr); // maintain accounting } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } /* tell the PMIx subsystem the job is complete */ if (NULL != opal_pmix.server_deregister_nspace) { opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL); } /* release the resources */ if (NULL != jdata->map) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (pptr->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(pptr); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is 
no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); } } cleanup: OBJ_RELEASE(caddy); }
/*
 * Sequentially map the ranks according to the placement in the
 * specified hostfile
 *
 * Node lists may come (in priority order) from dash-host, a per-app
 * hostfile, or the default hostfile.  Each rank is assigned to the next
 * node in the chosen list, with optional per-node cpusets.
 *
 * @param jdata  the job to map; its map, procs, and num_procs are updated
 * @return ORTE_SUCCESS, ORTE_ERR_TAKE_NEXT_OPTION if this mapper doesn't
 *         apply, ORTE_ERR_SILENT after a user-visible error message, or a
 *         specific error code
 */
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_app_context_t *app;
    int i, n;
    orte_std_cntr_t j;
    opal_list_item_t *item;
    orte_node_t *node, *nd;
    seq_node_t *sq, *save = NULL, *seq;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t default_seq_list;
    opal_list_t node_list, *seq_list, sq_list;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;
    char *hosts = NULL, *sep, *eptr;
    FILE *fp;
    opal_hwloc_resource_type_t rtype;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output,
                         "%s rmaps:seq called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* this mapper can only handle initial launch
     * when seq mapping is desired - allow
     * restarting of failed apps */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s is being restarted - seq cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper) {
        if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
            /* a mapper has been specified, and it isn't me */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: job %s not using sequential mapper",
                                ORTE_JOBID_PRINT(jdata->jobid));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
        /* we need to process it */
        goto process;
    }
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s not using seq mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

 process:
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:seq: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* convenience def */
    map = jdata->map;

    /* if there is a default hostfile, go and get its ordered list of nodes */
    OBJ_CONSTRUCT(&default_seq_list, opal_list_t);
    if (NULL != orte_default_hostfile) {
        char *hstname = NULL;
        /* open the file */
        fp = fopen(orte_default_hostfile, "r");
        if (NULL == fp) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto error;
        }
        while (NULL != (hstname = orte_getline(fp))) {
            if (0 == strlen(hstname)) {
                free(hstname);
                /* blank line - ignore */
                continue;
            }
            if ('#' == hstname[0]) {
                free(hstname);
                /* Comment line - ignore */
                continue;
            }
            sq = OBJ_NEW(seq_node_t);
            if (NULL != (sep = strchr(hstname, ' '))) {
                *sep = '\0';
                sep++;
                /* remove any trailing space */
                eptr = sep + strlen(sep) - 1;
                while (eptr > sep && isspace(*eptr)) {
                    eptr--;
                }
                *(eptr+1) = 0;
                sq->cpuset = strdup(sep);
            }
            // Strip off the FQDN if present, ignore IP addresses
            if (!orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname)) {
                char *ptr;
                if (NULL != (ptr = strchr(hstname, '.'))) {
                    *ptr = '\0';
                }
            }
            sq->hostname = hstname;
            opal_list_append(&default_seq_list, &sq->super);
        }
        fclose(fp);
    }

    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (0 < opal_list_get_size(&default_seq_list)) {
        save = (seq_node_t*)opal_list_get_first(&default_seq_list);
    }

    /* default to LOGICAL processors */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using PHYSICAL processors");
        rtype = OPAL_HWLOC_PHYSICAL;
    } else {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using LOGICAL processors");
        rtype = OPAL_HWLOC_LOGICAL;
    }

    /* initialize all the nodes as not included in this job map */
    for (j=0; j < orte_node_pool->size; j++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
        }
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        seq_list = NULL;

        /* dash-host trumps hostfile */
        if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using dash-host nodes on app %s", app->app);
            OBJ_CONSTRUCT(&node_list, opal_list_t);
            /* dash host entries cannot specify cpusets, so used the std
               function to retrieve the list */
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) {
                ORTE_ERROR_LOG(rc);
                free(hosts);
                OPAL_LIST_DESTRUCT(&node_list);  /* BUGFIX: release any partial list */
                goto error;
            }
            free(hosts);
            /* transfer the list to a seq_node_t list */
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) {
                sq = OBJ_NEW(seq_node_t);
                sq->hostname = strdup(nd->name);
                opal_list_append(&sq_list, &sq->super);
                OBJ_RELEASE(nd);
            }
            OBJ_DESTRUCT(&node_list);
            seq_list = &sq_list;
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
            char *hstname;
            if (NULL == hosts) {
                rc = ORTE_ERR_NOT_FOUND;
                goto error;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app);
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            /* open the file */
            fp = fopen(hosts, "r");
            if (NULL == fp) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                OBJ_DESTRUCT(&sq_list);
                free(hosts);  /* BUGFIX: hosts string was leaked on this path */
                goto error;
            }
            while (NULL != (hstname = orte_getline(fp))) {
                if (0 == strlen(hstname)) {
                    free(hstname);
                    /* blank line - ignore */
                    continue;
                }
                if ('#' == hstname[0]) {
                    free(hstname);
                    /* Comment line - ignore */
                    continue;
                }
                sq = OBJ_NEW(seq_node_t);
                if (NULL != (sep = strchr(hstname, ' '))) {
                    *sep = '\0';
                    sep++;
                    /* remove any trailing space */
                    eptr = sep + strlen(sep) - 1;
                    while (eptr > sep && isspace(*eptr)) {
                        eptr--;
                    }
                    *(eptr+1) = 0;
                    sq->cpuset = strdup(sep);
                }
                // Strip off the FQDN if present, ignore IP addresses
                if (!orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname)) {
                    char *ptr;
                    if (NULL != (ptr = strchr(hstname, '.'))) {
                        (*ptr) = '\0';
                    }
                }
                sq->hostname = hstname;
                opal_list_append(&sq_list, &sq->super);
            }
            fclose(fp);
            free(hosts);
            seq_list = &sq_list;
        } else if (0 < opal_list_get_size(&default_seq_list)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using default hostfile nodes on app %s", app->app);
            seq_list = &default_seq_list;
        } else {
            /* can't do anything - no nodes available! */
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources", true);
            /* BUGFIX: was a bare return that leaked default_seq_list */
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* check for nolocal and remove the head node, if required.
         * BUGFIX: the original advanced the iterator by calling
         * opal_list_get_next() on an item that had already been removed and
         * released (use-after-free); save the next pointer first */
        if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
            item = opal_list_get_first(seq_list);
            while (item != opal_list_get_end(seq_list)) {
                opal_list_item_t *next = opal_list_get_next(item);
                seq = (seq_node_t*)item;
                /* need to check ifislocal because the name in the
                 * hostfile may not have been FQDN, while name returned
                 * by gethostname may have been (or vice versa) */
                if (orte_ifislocal(seq->hostname)) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:seq: removing head node %s", seq->hostname);
                    opal_list_remove_item(seq_list, item);
                    OBJ_RELEASE(item);  /* "un-retain" it */
                }
                item = next;
            }
        }

        if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources", true);
            rc = ORTE_ERR_SILENT;
            goto error_local;  /* BUGFIX: clean up both lists instead of bare return */
        }

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: setting num procs to %s for app %s",
                                ORTE_VPID_PRINT(app->num_procs), app->app);
        } else if (num_nodes < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true,
                           app->num_procs, num_nodes);
            rc = ORTE_ERR_SILENT;
            goto error_local;  /* BUGFIX: clean up both lists instead of bare return */
        }

        if (seq_list == &default_seq_list) {
            sq = save;
        } else {
            sq = (seq_node_t*)opal_list_get_first(seq_list);
        }
        for (n=0; n < app->num_procs; n++) {
            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array.
             * BUGFIX: the original assigned the loop cursor directly to
             * "node", so on a lookup miss "node" held the last examined
             * (non-matching) pool entry and the not-found check below could
             * never fire; use a separate cursor instead */
            node = NULL;
            for (j=0; j < orte_node_pool->size; j++) {
                orte_node_t *cand = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j);
                if (NULL == cand) {
                    continue;
                }
                if (0 == strcmp(sq->hostname, cand->name)) {
                    node = cand;
                    break;
                }
            }
            if (NULL == node) {
                /* wasn't found - that is an error */
                orte_show_help("help-orte-rmaps-seq.txt",
                               "orte-rmaps-seq:resource-not-found", true, sq->hostname);
                rc = ORTE_ERR_SILENT;
                goto error_local;
            }
            /* ensure the node is in the map */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
            }
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if (NULL == proc) {
                /* BUGFIX: proc was dereferenced below without a check */
                rc = ORTE_ERR_OUT_OF_RESOURCE;
                goto error_local;
            }
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    rc = ORTE_ERR_SILENT;
                    goto error_local;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                /* check for permission */
                if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                    /* if we weren't given a directive either way, then we will error out
                     * as the #slots were specifically given, either by the host RM or
                     * via hostfile/dash-host */
                    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        rc = ORTE_ERR_SILENT;
                        goto error_local;  /* BUGFIX: was a bare return leaking both lists */
                    } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        /* if we were explicitly told not to oversubscribe, then don't */
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        rc = ORTE_ERR_SILENT;
                        goto error_local;  /* BUGFIX: was a bare return leaking both lists */
                    }
                }
            }

            /* assign the vpid */
            proc->name.vpid = vpid++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: assign proc %s to node %s for app %s",
                                ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app);

            /* record the cpuset, if given */
            if (NULL != sq->cpuset) {
                hwloc_cpuset_t bitmap;
                char *cpu_bitmap;
                if (NULL == node->topology) {
                    /* not allowed - for sequential cpusets, we must have
                     * the topology info */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology",
                                   true, node->name);
                    rc = ORTE_ERR_SILENT;
                    goto error_local;
                }
                /* if we are using hwthreads as cpus and binding to hwthreads, then
                 * we can just copy the cpuset across as it already specifies things
                 * at that level */
                if (opal_hwloc_use_hwthreads_as_cpus &&
                    OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    cpu_bitmap = strdup(sq->cpuset);
                } else {
                    /* setup the bitmap */
                    bitmap = hwloc_bitmap_alloc();
                    /* parse the slot_list to find the socket and core */
                    if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) {
                        ORTE_ERROR_LOG(rc);
                        hwloc_bitmap_free(bitmap);
                        goto error_local;
                    }
                    /* note that we cannot set the proc locale to any specific object
                     * as the slot list may have assigned it to more than one - so
                     * leave that field NULL */
                    /* set the proc to the specified map */
                    hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
                    hwloc_bitmap_free(bitmap);
                }
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP,
                                   ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s",
                                    ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap);
                /* we are going to bind to cpuset since the user is specifying the cpus */
                OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET);
                /* note that the user specified the mapping */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER);
                ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
                /* cleanup */
                free(cpu_bitmap);
            } else {
                hwloc_obj_t locale;
                /* assign the locale - okay for the topo to be null as
                 * it just means it wasn't returned */
                if (NULL != node->topology) {
                    locale = hwloc_get_root_obj(node->topology);
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                       ORTE_ATTR_LOCAL, locale, OPAL_PTR);
                }
            }

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error_local;
            }
            /* move to next node */
            sq = (seq_node_t*)opal_list_get_next(&sq->super);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list if it came from this app_context */
        if (seq_list != &default_seq_list) {
            OPAL_LIST_DESTRUCT(seq_list);
        } else {
            save = sq;
        }
    }

    /* BUGFIX: the default list was leaked on the success path */
    OPAL_LIST_DESTRUCT(&default_seq_list);
    return ORTE_SUCCESS;

 error_local:
    /* release the per-app list (if any) before the common cleanup */
    if (NULL != seq_list && seq_list != &default_seq_list) {
        OPAL_LIST_DESTRUCT(seq_list);
    }
 error:
    OPAL_LIST_DESTRUCT(&default_seq_list);
    return rc;
}
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; int i; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors finalizing - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_LIFELINE_LOST == state || ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:orted lifeline lost - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set our exit status */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - we can't seem to trust that we will catch the waitpid * in this situation, so push this over to be handled as if * it were a waitpid trigger so we don't create a bunch of * duplicate code */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* leave the exit code alone - process this as a waitpid */ odls_base_default_wait_local_proc(child, NULL); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if we are using static ports, then it is possible that the HNP * will not see this termination. 
So if the HNP didn't order us * to terminate, then we should ensure it knows */ if (orte_static_ports && !orte_orteds_term_ordered) { /* send an alert to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* set the exit code to reflect the problem */ child->exit_code = ORTE_ERR_COMM_FAILURE; /* pack only the data for this daemon - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the daemon's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting lost connection to daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* continue on */ goto cleanup; } if (orte_orteds_term_ordered) { /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s errmgr:default:orted[%s(%d)] proc %s is alive", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, ORTE_NAME_PRINT(&child->name))); goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } } /* if not, then we can continue */ goto cleanup; } if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* if this is not a local proc for this job, we can * ignore this call */ if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors proc is not local - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { /* update the state */ child->state = state; /* report this as abnormal termination to the HNP, unless we already have * done so for this job */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc 
- have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if we were ordered to terminate, see if * any of our routes or local children remain alive - if not, then * terminate ourselves. */ if (orte_orteds_term_ordered) { /* mark the child as no longer alive and update the counters, if necessary. 
* we have to do this here as we aren't going to send this to the state * machine, and we want to keep the bookkeeping accurate just in case */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); } if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED); jdata->num_terminated++; } for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } /* no need to alert the HNP - we are already on our way out */ goto cleanup; } keep_going: /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away - but * only do this once! 
*/ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }
static void check_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = caddy->jdata; orte_proc_t *proc; int i; orte_node_t *node; orte_job_map_t *map; orte_std_cntr_t index; char *rtmod; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:dvm:check_job_complete on job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid)); if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* just check to see if the daemons are complete */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:dvm:check_job_complete - received NULL job, checking daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); rtmod = orte_rml.get_routed(orte_mgmt_conduit); if (0 == orte_routed.num_routes(rtmod)) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s orteds complete - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (NULL == jdata) { jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); } ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED); OBJ_RELEASE(caddy); return; } OBJ_RELEASE(caddy); return; } /* mark the job as terminated, but don't override any * abnormal termination flags */ if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) { jdata->state = ORTE_JOB_STATE_TERMINATED; } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } /* tell the PMIx subsystem the job is complete */ if (NULL != opal_pmix.server_deregister_nspace) { opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL); } /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". * Note that an aborted/killed job -is- flagged as complete and will * therefore have its resources released. 
We need to do this after * we call the errmgr so that any attempt to restart the job will * avoid doing so in the exact same place as the current job */ if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:dvm releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (proc->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:dvm releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(proc); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { /* this was a debugger daemon. 
notify that a debugger has detached */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH); } else if (jdata->state != ORTE_JOB_STATE_NOTIFIED) { OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:dvm:check_job_completed state is terminated - activating notify", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED); /* mark the job as notified */ jdata->state = ORTE_JOB_STATE_NOTIFIED; } OBJ_RELEASE(caddy); }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; int8_t flag; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { 
continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } } cleanup: OBJ_RELEASE(caddy); }
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; int i; char *rtmod; orte_process_name_t parent, target, *npptr; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get our "lifeline" routed module */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* tell the PMIx subsystem to cleanup this client */ opal_pmix.server_deregister_client(proc, NULL, NULL); /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:base all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); /* if they requested notification upon completion, provide it */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) { /* notify_completion => notify the parent of the termination * of this child job. 
So get the parent jobid info */ npptr = &parent; if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) { /* notify everyone who asked for it */ target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD); } else { target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent); } } } else if (ORTE_PROC_STATE_TERMINATED < pdata->state && !orte_job_term_ordered) { /* if this was an abnormal term, notify the other procs of the termination */ parent.jobid = jdata->jobid; parent.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent); } } cleanup: OBJ_RELEASE(caddy); }
static void track_procs(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(2, orte_state_base_framework.framework_output, "%s state:staged_hnp:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); OBJ_RELEASE(caddy); return; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } /* if this is a registration, check to see if it came from * inside MPI_Init - if it did, that is not acceptable */ if (ORTE_PROC_STATE_REGISTERED == state) { if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_AS_MPI) && !ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_GANG_LAUNCHED)) { /* we can't support this - issue an error and abort */ orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); } /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { goto terminated; } OBJ_RELEASE(caddy); return; } if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { goto terminated; } OBJ_RELEASE(caddy); return; } /* if the proc terminated, see if any other procs are * waiting to run. We assume that the app_contexts are * in priority order, with the highest priority being * at position 0 in the app_context array for this job */ if (ORTE_PROC_STATE_TERMINATED == state) { terminated: /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = ORTE_PROC_STATE_TERMINATED; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { /* no other procs are waiting, so end this job */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } else if (jdata->num_mapped < jdata->num_procs) { /* schedule the job for re-mapping so that procs * waiting for resources can execute */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP); } /* otherwise, do nothing until more procs terminate */ OBJ_RELEASE(caddy); return; } }
static void setup_job_complete(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata = caddy->jdata; int i, j; orte_app_context_t *app; orte_proc_t *proc; orte_vpid_t vpid; opal_buffer_t *buf; /* check that the job meets our requirements */ vpid = 0; for (i=0; i < jdata->apps->size; i++) { if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) { continue; } if (app->num_procs <= 0) { /* must specify -np for staged_hnp execution */ orte_show_help("help-state-staged-hnp.txt", "no-np", true); ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT); OBJ_RELEASE(caddy); return; } /* build the proc arrays - we'll need them later */ for (j=0; j < app->num_procs; j++) { proc = OBJ_NEW(orte_proc_t); proc->name.jobid = jdata->jobid; proc->name.vpid = vpid; proc->app_idx = i; proc->app_rank = j; /* flag that the proc is NOT to be included * in a pidmap message so we don't do it until * the proc is actually scheduled for launch */ ORTE_FLAG_UNSET(proc, ORTE_PROC_FLAG_UPDATED); /* procs must not barrier when executing in stages */ orte_set_attribute(&proc->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_GLOBAL, NULL, OPAL_BOOL); /* add it to the job */ opal_pointer_array_set_item(jdata->procs, vpid, proc); jdata->num_procs++; vpid++; /* add it to the app */ OBJ_RETAIN(proc); opal_pointer_array_set_item(&app->procs, j, proc); } } /* set the job map to use the staged_hnp mapper */ if (NULL == jdata->map) { jdata->map = OBJ_NEW(orte_job_map_t); jdata->map->req_mapper = strdup("staged"); ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_STAGED); ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE); jdata->map->display_map = orte_rmaps_base.display_map; } /* if there are any file_maps attached to this job, load them */ buf = NULL; if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FILE_MAPS, (void**)&buf, OPAL_BUFFER)) { orte_dfs.load_file_maps(jdata->jobid, buf, 
NULL, NULL); OBJ_RELEASE(buf); } orte_plm_base_setup_job_complete(0, 0, (void*)caddy); }
/*
 * Parent side of the fork/exec handshake: close the child's ends of the
 * IO pipes, then block reading diagnostic messages from the error pipe
 * until it closes (OPAL_ERR_TIMEOUT), which signals a successful launch.
 *
 * Returns ORTE_SUCCESS on successful launch, otherwise an error code;
 * read_fd is closed on every return path.
 */
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    /* close the child-side pipe ends we don't use in the parent */
    if (cd->opts.connect_stdin) {
        close(cd->opts.p_stdin[0]);
    }
    close(cd->opts.p_stdout[1]);
    if (!orte_iof_base.redirect_app_stderr_to_stdout) {
        close(cd->opts.p_stderr[1]);
    }
#if OPAL_PMIX_V1
    close(cd->opts.p_internal[1]);
#endif

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);
            if (NULL != cd->child) {
                cd->child->state = ORTE_PROC_STATE_UNDEF;
            }
            return rc;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != cd->child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Read in the strings; ensure to terminate them with \0 */
        if (msg.file_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                /* FIX: this path previously leaked read_fd */
                close(read_fd);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                /* FIX: this path previously leaked read_fd */
                close(read_fd);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            /* calloc zero-fills, so the +1 byte guarantees NUL termination */
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                /* FIX: report the real failing call, return a real error
                 * (the original returned the stale rc == OPAL_SUCCESS,
                 * telling the caller the launch succeeded), and close the
                 * pipe descriptor */
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "calloc", __FILE__, __LINE__);
                close(read_fd);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
            /* FIX: the read result was previously ignored, so a failed read
             * would render an uninitialized/partial message below */
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                free(str);
                close(read_fd);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
        }

        /* Print out what we got.  We already have a rendered string,
           so use orte_show_help_norender(). */
        if (msg.msg_str_len > 0) {
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched
           successfully). */
        if (msg.fatal) {
            if (NULL != cd->child) {
                cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != cd->child) {
        cd->child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);

    return ORTE_SUCCESS;
}