static void track_jobs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc; if (ORTE_JOB_STATE_LOCAL_LAUNCH_COMPLETE == caddy->job_state) { OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_jobs sending local launch complete for job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(caddy->jdata->jobid))); /* update the HNP with all proc states for this job */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, caddy->jdata))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } } cleanup: OBJ_RELEASE(caddy); }
static void send_fms(opal_buffer_t *bptr, void *cbdata) { orte_proc_t *pdata = (orte_proc_t*)cbdata; orte_proc_t *pptr; orte_job_t *jdata; opal_buffer_t *xfer, *alert; orte_dfs_cmd_t cmd = ORTE_DFS_RELAY_POSTS_CMD; int rc, i; orte_plm_cmd_flag_t cmd2; /* we will get a NULL buffer if there are no maps, so check and * ignore sending an update if that's the case */ if (NULL != bptr) { opal_output_verbose(1, orte_state_base_framework.framework_output, "%s SENDING FILE MAPS FOR %s OF SIZE %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name), (int)bptr->bytes_used); xfer = OBJ_NEW(opal_buffer_t); if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &cmd, 1, ORTE_DFS_CMD_T))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return; } if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &pdata->name, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return; } if (OPAL_SUCCESS != (rc = opal_dss.pack(xfer, &bptr, 1, OPAL_BUFFER))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return; } if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer, ORTE_RML_TAG_DFS_CMD, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(xfer); return; } } /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(&pdata->name); /* alert the HNP */ cmd2 = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd2, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); return; } /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(pdata->name.jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return; } /* pack the info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata, pdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s SENDING TERMINATION UPDATE FOR PROC %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* find this proc in the children array and remove it so * we don't keep telling the HNP that it died */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr == pdata) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(pdata); break; } } }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; char *rtmod; orte_std_cntr_t index; orte_job_map_t *map; orte_node_t *node; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, notify the HNP */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process * Do this after we handle termination in case the IOF needs * to check to see if all procs from the job are actually terminated */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDALL); } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes gone but proc %s still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* cleanup the procs as these are gone */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } /* if this child is part of the job... */ if (pptr->name.jobid == jdata->jobid) { /* clear the entry in the local children */ opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(pptr); // maintain accounting } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } /* tell the PMIx subsystem the job is complete */ if (NULL != opal_pmix.server_deregister_nspace) { opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL); } /* release the resources */ if (NULL != jdata->map) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (pptr->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(pptr); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); } } cleanup: OBJ_RELEASE(caddy); }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; int i; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors finalizing - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_LIFELINE_LOST == state || ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:orted lifeline lost - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set our exit status */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - we can't seem to trust that we will catch the waitpid * in this situation, so push this over to be handled as if * it were a waitpid trigger so we don't create a bunch of * duplicate code */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* leave the exit code alone - process this as a waitpid */ odls_base_default_wait_local_proc(child, NULL); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if we are using static ports, then it is possible that the HNP * will not see this termination. So if the HNP didn't order us * to terminate, then we should ensure it knows */ if (orte_static_ports && !orte_orteds_term_ordered) { /* send an alert to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* set the exit code to reflect the problem */ child->exit_code = ORTE_ERR_COMM_FAILURE; /* pack only the data for this daemon - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the daemon's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting lost connection to daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* continue on */ goto cleanup; } if (orte_orteds_term_ordered) { /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s errmgr:default:orted[%s(%d)] proc %s is alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, ORTE_NAME_PRINT(&child->name))); goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } } /* if not, then we can continue */ goto cleanup; } if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* if this is not a local proc for this job, we can * ignore this call */ if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors proc is not local - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { /* update the state */ child->state = state; /* report this as abnormal termination to the HNP, unless we already have * done so for this job */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if we were ordered to terminate, see if * any of our routes or local children remain alive - if not, then * terminate ourselves. */ if (orte_orteds_term_ordered) { /* mark the child as no longer alive and update the counters, if necessary. * we have to do this here as we aren't going to send this to the state * machine, and we want to keep the bookkeeping accurate just in case */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); } if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED); jdata->num_terminated++; } for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } /* no need to alert the HNP - we are already on our way out */ goto cleanup; } keep_going: /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away - but * only do this once! */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }
static void job_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_job_state_t jobstate; int rc; orte_plm_cmd_flag_t cmd; opal_buffer_t *alert; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return; } /* if the jdata is NULL, then we abort as this * is reporting an unrecoverable error */ if (NULL == caddy->jdata) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FORCED_EXIT); OBJ_RELEASE(caddy); return; } /* update the state */ jdata = caddy->jdata; jobstate = caddy->job_state; jdata->state = jobstate; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted: job %s reported error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), orte_job_state_to_str(jobstate))); switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jdata); break; case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* order termination */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* let the HNP handle this */ goto cleanup; break; default: break; } alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); goto cleanup; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } cleanup: OBJ_RELEASE(caddy); }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; int8_t flag; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } } cleanup: OBJ_RELEASE(caddy); }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; orte_ns_cmp_bitmask_t mask=ORTE_NS_CMP_ALL; int i; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - ignore */ goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:orted daemon %s was a lifeline - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (child->alive && child->state < ORTE_PROC_STATE_UNTERMINATED) { goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_TERMINATE(0); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } /* if not, then we can continue */ goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ goto cleanup; } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* if there are no local procs for this job, we can * ignore this call */ if (0 == jdata->num_local_procs) { goto cleanup; } /* find this proc in the local children */ child = NULL; for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &ptr->name, proc)) { child = ptr; break; } } if (NULL == child) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { child->state = state; /* Decrement the number of local procs */ jdata->num_local_procs--; /* kill this proc */ killprocs(proc->jobid, proc->vpid); goto cleanup; } if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { if (!orte_abort_non_zero_exit) { /* leave the child in orte_local_children so we can * later send the state info after full job termination */ child->state = state; child->waitpid_recvd = true; if (child->iof_complete) { /* the proc has terminated */ child->alive = false; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(&child->name); /* track job status */ jdata->num_terminated++; } /* treat this as normal termination */ goto REPORT_STATE; } /* report this as abnormal termination to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* remove the child from our local array as it is no longer alive */ opal_pointer_array_set_item(orte_local_children, i, NULL); /* Decrement the number of local procs */ jdata->num_local_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* release the child object */ OBJ_RELEASE(child); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } return; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* remove the child from our local array as it is no longer alive */ opal_pointer_array_set_item(orte_local_children, i, NULL); /* Decrement the number of local procs */ jdata->num_local_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* release the child object */ OBJ_RELEASE(child); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } REPORT_STATE: if (ORTE_PROC_STATE_REGISTERED == state) { /* see if everyone in this job has registered */ if (all_children_registered(proc->jobid)) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted: sending contact info to HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack init routes command */ cmd = ORTE_PLM_INIT_ROUTES_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (ptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &ptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return; } } } /* pack an invalid marker */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); return; } /* add in contact info for all procs in the job */ if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&alert); return; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } } return; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list - do not lock * the thread as we are already locked */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }
static int update_state(orte_jobid_t job, orte_job_state_t jobstate, orte_process_name_t *proc, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code) { opal_list_item_t *item, *next; orte_odls_job_t *jobdat = NULL; orte_odls_child_t *child; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; orte_ns_cmp_bitmask_t mask; /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, "errmgr:default_orted:update_state() %s) " "------- %s state updated for process %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ((NULL == proc) ? "App. Process" : (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { return ORTE_SUCCESS; } /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* this is an update for an entire job */ if (ORTE_JOBID_INVALID == job) { /* whatever happened, we don't know what job * it happened to */ orte_show_help("help-orte-errmgr.txt", "errmgr:unknown-job-error", true, orte_job_state_to_str(jobstate)); alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return rc; } /* pack the "invalid" jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return rc; } if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } return rc; } /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == job) { break; } } if (NULL == jobdat) { return ORTE_ERR_NOT_FOUND; } switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jobdat, exit_code); break; case ORTE_JOB_STATE_RUNNING: /* update all local child states */ update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); break; case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: /* update all procs in job */ update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); /* order all local procs for this job to be killed */ killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* tell the caller we can't recover */ return ORTE_ERR_UNRECOVERABLE; break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* let the HNP handle this */ return ORTE_SUCCESS; break; default: break; } alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { ORTE_ERROR_LOG(rc); } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } return rc; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { return ORTE_SUCCESS; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - ignore */ return ORTE_SUCCESS; } /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ orte_quit(); } /* if not, then indicate we can continue */ return ORTE_SUCCESS; } /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == proc->jobid) { break; } } if (NULL == jobdat) { /* must already be complete */ return ORTE_SUCCESS; } /* if there are no local procs for this job, we can * ignore this call */ if (0 == jobdat->num_local_procs) { return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted got state %s for proc %s pid %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc), pid)); /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { /* find this proc in the local children */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; /* Decrement the number of local procs */ jobdat->num_local_procs--; /* kill this proc */ killprocs(proc->jobid, proc->vpid); } return ORTE_SUCCESS; } } } if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { if (!orte_abort_non_zero_exit) { /* treat this as normal termination */ goto REPORT_STATE; } } if (ORTE_PROC_STATE_TERMINATED < state) { /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return rc; } /* find this proc in the local children */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; } /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return rc; } /* remove the child from our local list as it is no longer alive */ opal_list_remove_item(&orte_local_children, &child->super); /* Decrement the number of local procs */ jobdat->num_local_procs--; OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), jobdat->num_local_procs)); /* release the child object */ OBJ_RELEASE(child); /* done with loop */ break; } } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } return rc; } REPORT_STATE: /* find this proc in the local children so we can update its state */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; mask = ORTE_NS_CMP_ALL; if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; if (0 < pid) { child->pid = pid; } child->exit_code = exit_code; } /* done with loop */ break; } } if (ORTE_PROC_STATE_REGISTERED == state) { /* see if everyone in this job has registered */ if (all_children_registered(proc->jobid)) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted: sending contact info to HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack init routes command */ cmd = ORTE_PLM_INIT_ROUTES_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* pack all the local child vpids and epochs */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; if (child->name->jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->name->vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } } } /* pack an invalid marker */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* add in contact info for all procs in the job */ if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, alert))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&alert); return rc; } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } return rc; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == proc->jobid) { break; } } if (NULL == jobdat) { /* race condition - may not have been formed yet */ return ORTE_SUCCESS; } alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jobdat))) { ORTE_ERROR_LOG(rc); } FINAL_CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobdat->jobid))); /* remove all of this job's children from the global list - do not lock * the thread as we are already locked */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { child = (orte_odls_child_t*)item; next = opal_list_get_next(item); if (jobdat->jobid == child->name->jobid) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jobdat->jobid); /* remove this job from our local job data since it is complete */ opal_list_remove_item(&orte_local_jobdata, &jobdat->super); OBJ_RELEASE(jobdat); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, 0, cbfunc, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } /* indicate that the job is complete */ return rc; } return ORTE_SUCCESS; }