void mca_oob_usock_component_lost_connection(int fd, short args, void *cbdata) { mca_oob_usock_peer_op_t *pop = (mca_oob_usock_peer_op_t*)cbdata; uint64_t ui64; int rc; opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s usock:lost connection called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer->name)); /* retrieve the peer's name */ memcpy(&ui64, (char*)&(pop->peer->name), sizeof(uint64_t)); /* mark the OOB's table that we can't reach it any more - for now, we don't * worry about shifting to another component. Eventually, we will want to push * this decision to the OOB so it can try other components and eventually error out */ if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, NULL))) { ORTE_ERROR_LOG(rc); } /* activate the proc state */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer->name)) { ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_LIFELINE_LOST); } else { ORTE_ACTIVATE_PROC_STATE(&pop->peer->name, ORTE_PROC_STATE_COMM_FAILED); } OBJ_RELEASE(pop); }
void mca_oob_tcp_component_failed_to_connect(int fd, short args, void *cbdata) { mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata; opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:failed_to_connect called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); /* if we are terminating, then don't attempt to reconnect */ if (orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { OBJ_RELEASE(pop); return; } /* activate the proc state */ opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:failed_to_connect unable to reach peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); /* if this was a lifeline, then alert */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST); } else { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED); } OBJ_RELEASE(pop); }
/* this function automatically gets periodically called * by the event library so we can check on the state * of the various orteds */ static void check_heartbeat(int fd, short dummy, void *arg) { int v; orte_proc_t *proc; opal_event_t *tmp = (opal_event_t*)arg; OPAL_OUTPUT_VERBOSE((3, orte_sensor_base.output, "%s sensor:check_heartbeat", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* if we are aborting or shutting down, ignore this */ if (orte_abnormal_term_ordered || orte_finalizing || !orte_initialized) { OPAL_OUTPUT_VERBOSE((3, orte_sensor_base.output, "%s IGNORING CHECK abnorm_term %s fin %s init %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_abnormal_term_ordered ? "TRUE" : "FALSE", orte_finalizing ? "TRUE" : "FALSE", orte_initialized ? "TRUE" : "FALSE")); check_active = false; return; } for (v=0; v < daemons->procs->size; v++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, v))) { continue; } /* ignore myself */ if (proc->name.vpid == ORTE_PROC_MY_NAME->vpid) { continue; } if (ORTE_PROC_STATE_RUNNING != proc->state) { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sensor:heartbeat DAEMON %s IS NOT RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); continue; } if (0 == proc->beat) { /* no heartbeat recvd in last window */ OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s sensor:check_heartbeat FAILED for daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name))); ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_HEARTBEAT_FAILED); } else { OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, "%s HEARTBEAT DETECTED FOR %s: NUM BEATS %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), proc->beat)); } /* reset for next period */ proc->beat = 0; } /* reset the timer */ opal_event_evtimer_add(tmp, &check_time); }
/*
 * Generic RML send-completion callback: dispose of the buffer we were
 * given and, if the send failed, let the state machine know we could
 * not reach the peer.
 */
void orte_rml_send_callback(int status, orte_process_name_t *peer,
                            opal_buffer_t* buffer, orte_rml_tag_t tag,
                            void* cbdata)
{
    /* the buffer is ours to release regardless of outcome */
    OBJ_RELEASE(buffer);

    if (ORTE_SUCCESS == status) {
        return;
    }
    /* send failed - report the unreachable peer */
    ORTE_ACTIVATE_PROC_STATE(peer, ORTE_PROC_STATE_UNABLE_TO_SEND_MSG);
}
/*
 * Event callback: mark a still-running proc as having terminated with
 * a non-zero status.  Procs that already reached a terminal state are
 * left untouched.
 */
static void process_state_monitor_cb(int fd, short args, void *cbdata)
{
    orte_proc_t *proc = (orte_proc_t*)cbdata;

    /* nothing to do if the proc already terminated */
    if (proc->state >= ORTE_PROC_STATE_TERMINATED) {
        return;
    }

    proc->state = ORTE_PROC_STATE_TERM_NON_ZERO;
    ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_TERM_NON_ZERO);
}
void mca_oob_tcp_component_lost_connection(int fd, short args, void *cbdata) { mca_oob_tcp_peer_op_t *pop = (mca_oob_tcp_peer_op_t*)cbdata; uint64_t ui64; orte_oob_base_peer_t *bpr; int rc; opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:lost connection called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pop->peer)); /* if we are terminating, or recovery isn't enabled, then don't attempt to reconnect */ if (!orte_enable_recovery || orte_orteds_term_ordered || orte_finalizing || orte_abnormal_term_ordered) { goto cleanup; } /* Mark that we no longer support this peer */ memcpy(&ui64, (char*)&pop->peer, sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, ui64, (void**)&bpr) || NULL == bpr) { bpr = OBJ_NEW(orte_oob_base_peer_t); } opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx); if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, NULL))) { ORTE_ERROR_LOG(rc); } cleanup: /* activate the proc state */ if (ORTE_SUCCESS != orte_routed.route_lost(&pop->peer)) { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_LIFELINE_LOST); } else { ORTE_ACTIVATE_PROC_STATE(&pop->peer, ORTE_PROC_STATE_COMM_FAILED); } OBJ_RELEASE(pop); }
void mca_oob_tcp_component_no_route(int fd, short args, void *cbdata) { mca_oob_tcp_msg_error_t *mop = (mca_oob_tcp_msg_error_t*)cbdata; uint64_t ui64; int rc; orte_oob_base_peer_t *bpr; opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:no route called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); /* mark that we cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, ui64, (void**)&bpr) || NULL == bpr) { bpr = OBJ_NEW(orte_oob_base_peer_t); } opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx); if (OPAL_SUCCESS != (rc = opal_hash_table_set_value_uint64(&orte_oob_base.peers, ui64, NULL))) { ORTE_ERROR_LOG(rc); } /* report the error back to the OOB and let it try other components * or declare a problem */ if (!orte_finalizing && !orte_abnormal_term_ordered) { /* if this was a lifeline, then alert */ if (ORTE_SUCCESS != orte_routed.route_lost(&mop->hop)) { ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_LIFELINE_LOST); } else { ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); } } OBJ_RELEASE(mop); }
/*
 * Restart the given job from the checkpoint identified by
 * (global_handle, seq_num).  Blocks (driving opal_progress) until the
 * recovery completes, then restores the previous "last stable" sstore
 * handle.
 *
 * Returns ORTE_SUCCESS on success, or the error code of the failing
 * step otherwise.
 */
int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_process_name_t loc_proc;
    orte_job_t *jdata;
    orte_sstore_base_handle_t prev_sstore_handle = ORTE_SSTORE_HANDLE_INVALID;

    /* JJH First determine if we can recover this way */

    /*
     * Find the corresponding sstore handle
     */
    prev_sstore_handle = orte_sstore_handle_last_stable;
    if (ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&orte_sstore_handle_last_stable,
                                                                  NULL, global_handle,
                                                                  seq_num, NULL))) {
        ORTE_ERROR_LOG(ret);
        /* bug fix: propagate the failure - exit_status was previously
         * left at ORTE_SUCCESS on this path, so callers were told the
         * restart succeeded even though it never started */
        exit_status = ret;
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(jobid))) {
        exit_status = ORTE_ERR_NOT_FOUND;
        ORTE_ERROR_LOG(exit_status);
        goto cleanup;
    }

    /*
     * Start the recovery
     */
    orte_snapc_base_has_recovered = false;
    loc_proc.jobid = jobid;
    loc_proc.vpid = 0;
    ORTE_ACTIVATE_PROC_STATE(&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD);
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_CONTROL_RESTART);

    /* block, driving progress, until the recovery completes */
    while (!orte_snapc_base_has_recovered) {
        opal_progress();
    }

    orte_sstore_handle_last_stable = prev_sstore_handle;

 cleanup:
    return exit_status;
}
/*
 * Record loss of UD connectivity to a peer.  Under the peer lock,
 * flip the availability flag and - only on the first loss - notify
 * the errmgr framework via a COMM_FAILED proc state.
 */
void mca_oob_ud_peer_lost (mca_oob_ud_peer_t *peer)
{
    bool was_available;

    OPAL_THREAD_LOCK(&peer->peer_lock);
    was_available = peer->peer_available;
    peer->peer_available = false;
    if (was_available) {
        OPAL_OUTPUT_VERBOSE((10, mca_oob_base_output, "%s oob:ud:peer_lost lost connectivity to peer "
                             "%s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&peer->peer_name)));
        /* inform the ERRMGR framework that we have lost a connection so
         * it can decide if this is important, what to do about it, etc. */
        ORTE_ACTIVATE_PROC_STATE(&peer->peer_name, ORTE_PROC_STATE_COMM_FAILED);
    }
    OPAL_THREAD_UNLOCK(&peer->peer_lock);
}
/*
 * PMIx server hook: a client called PMIx_Abort.  Record the requested
 * exit code (on the proc object if one was supplied, and globally),
 * push the proc into the CALLED_ABORT state, then release the caller
 * via its callback.  Always returns OPAL_SUCCESS.
 */
int pmix_server_abort_fn(opal_process_name_t *proc, void *server_object,
                         int status, const char msg[],
                         opal_list_t *procs_to_abort,
                         opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
{
    orte_proc_t *p = (orte_proc_t*)server_object;

    if (NULL != p) {
        p->exit_code = status;
    }
    ORTE_UPDATE_EXIT_STATUS(status);
    ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_CALLED_ABORT);

    /* release the caller */
    if (NULL != cbfunc) {
        cbfunc(OPAL_SUCCESS, cbdata);
    }
    return OPAL_SUCCESS;
}
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; int i; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ if (orte_orteds_term_ordered && 0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:base all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; int i; char *rtmod; orte_process_name_t parent, target, *npptr; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get our "lifeline" routed module */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); if (pdata->state < ORTE_PROC_STATE_TERMINATED) { pdata->state = state; } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* tell the PMIx subsystem to cleanup this client */ opal_pmix.server_deregister_client(proc, NULL, NULL); /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:base all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); /* if they requested notification upon completion, provide it */ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) { /* notify_completion => notify the parent of the termination * of this child job. 
So get the parent jobid info */ npptr = &parent; if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) { /* notify everyone who asked for it */ target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD); } else { target.jobid = jdata->jobid; target.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent); } } } else if (ORTE_PROC_STATE_TERMINATED < pdata->state && !orte_job_term_ordered) { /* if this was an abnormal term, notify the other procs of the termination */ parent.jobid = jdata->jobid; parent.vpid = ORTE_VPID_WILDCARD; _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent); } } cleanup: OBJ_RELEASE(caddy); }
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_proc_t *pptr, *proct; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; int i; int32_t i32, *i32ptr; OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output, "%s errmgr:default_hnp: for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* could be a race condition */ goto cleanup; } pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); /* we MUST handle a communication failure before doing anything else * as it requires some special care to avoid normal termination issues * for local application procs */ if (ORTE_PROC_STATE_COMM_FAILED == state) { /* is this to a daemon? 
*/ if (ORTE_PROC_MY_NAME->jobid != proc->jobid) { /* nope - ignore it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure to non-daemon proc - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is my own connection, ignore it */ if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure on my own connection - ignoring it", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* mark the daemon as gone */ ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE); /* if we have ordered orteds to terminate or abort * is in progress, record it */ if (orte_orteds_term_ordered || orte_abnormal_term_ordered) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemons terminating - recording daemon %s as gone", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); /* if all my routes and local children are gone, then terminate ourselves */ if (0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: at least one proc (%s) still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proct->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr_hnp: all routes and children gone - ordering exit", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: %d routes remain 
alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } goto cleanup; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s Comm failure: daemon %s - aborting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* record the first one to fail */ if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* output an error message so the user knows what happened */ orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name); /* mark the daemon job as failed */ jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* update our exit code */ ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* just in case the exit code hadn't been set, do it here - this * won't override any reported exit code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); } /* abort the system */ default_hnp_abort(jdata); goto cleanup; } /* update the proc state - can get multiple reports on a proc * depending on circumstances, so ensure we only do this once */ if (pptr->state < ORTE_PROC_STATE_TERMINATED) { pptr->state = state; } /* if we were ordered to terminate, mark this proc as dead and see if * any of our routes or local children remain alive - if not, then * terminate ourselves. 
*/ if (orte_orteds_term_ordered) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:hnp all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } } keep_going: /* ensure we record the failed proc properly so we can report * the error once we terminate */ switch (state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s killed by cmd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } /* don't abort the job as this isn't an abnormal termination */ break; case ORTE_PROC_STATE_ABORTED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case 
ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s aborted by signal", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s terminated without sync", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* now treat a special case - if the proc exit'd without a required * sync, it may have done 
so with a zero exit code. We want to ensure * that the user realizes there was an error, so in this -one- case, * we overwrite the process' exit code with the default error code */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_FAILED_TO_START: case ORTE_PROC_STATE_FAILED_TO_LAUNCH: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { if (ORTE_PROC_STATE_FAILED_TO_START) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; } else { jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH; } /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* if this was a daemon, report it */ if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) { /* output a message indicating we failed to launch a daemon */ orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true); } break; case ORTE_PROC_STATE_CALLED_ABORT: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s called abort with exit code %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_CALLED_ABORT; /* point to the first proc to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it 
doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; case ORTE_PROC_STATE_TERM_NON_ZERO: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s exited with non-zero status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), pptr->exit_code)); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* track the number of non-zero exits */ i32 = 0; i32ptr = &i32; orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32); ++i32; orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32); if (orte_abort_non_zero_exit) { if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); /* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } } else { /* user requested we consider this normal termination */ if (jdata->num_terminated >= jdata->num_procs) { /* this job has terminated */ ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } break; case ORTE_PROC_STATE_HEARTBEAT_FAILED: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s heartbeat failed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; /* point to the first rank to cause the problem */ orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR); 
/* retain the object so it doesn't get free'd */ OBJ_RETAIN(pptr); ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED); ORTE_UPDATE_EXIT_STATUS(pptr->exit_code); /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } /* remove from dependent routes, if it is one */ orte_routed.route_lost(proc); break; case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: unable to send message to proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if this proc is one of my daemons, then we are truly * hosed - so just exit out */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); break; } if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) { /* abnormal termination - abort, but only do it once * to avoid creating a lot of confusion */ default_hnp_abort(jdata); } break; default: /* shouldn't get this, but terminate job if required */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:hnp: proc %s default error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } break; } /* if the waitpid fired, be sure to let the state machine know */ if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED); } cleanup: OBJ_RELEASE(caddy); }
/*
 * PMIx server hook: a local client has connected to the server.
 * Move the proc to the REGISTERED state; always reports success.
 */
int pmix_server_client_connected_fn(opal_process_name_t *proc, void *server_object)
{
    ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_REGISTERED);
    return ORTE_SUCCESS;
}
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; char *rtmod; orte_std_cntr_t index; orte_job_map_t *map; orte_node_t *node; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, notify the HNP */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, 
&pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); /* Release the stdin IOF file descriptor for this child, if one * was defined. File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process * Do this after we handle termination in case the IOF needs * to check to see if all procs from the job are actually terminated */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDALL); } if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. 
This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ rtmod = orte_rml.get_routed(orte_mgmt_conduit); if (orte_orteds_term_ordered && 0 == orte_routed.num_routes(rtmod)) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes gone but proc %s still alive", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pdata->name))); goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent 
it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* cleanup the procs as these are gone */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } /* if this child is part of the job... */ if (pptr->name.jobid == jdata->jobid) { /* clear the entry in the local children */ opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(pptr); // maintain accounting } } /* tell the IOF that the job is complete */ if (NULL != orte_iof.complete) { orte_iof.complete(jdata); } /* tell the PMIx subsystem the job is complete */ if (NULL != opal_pmix.server_deregister_nspace) { opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL); } /* release the resources */ if (NULL != jdata->map) { map = jdata->map; for (index = 0; index < map->nodes->size; index++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { continue; } OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing procs from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name)); for (i = 0; i < node->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } if (pptr->name.jobid != jdata->jobid) { /* skip procs from another job */ continue; } node->slots_inuse--; node->num_procs--; OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output, "%s state:orted releasing proc %s from node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pptr->name), node->name)); /* set the entry in the node array to NULL */ opal_pointer_array_set_item(node->procs, i, NULL); /* release the proc once for the map entry */ OBJ_RELEASE(pptr); } /* set the node location to NULL */ opal_pointer_array_set_item(map->nodes, index, NULL); /* maintain accounting */ OBJ_RELEASE(node); /* flag that the node is 
no longer in a map */ ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED); } OBJ_RELEASE(map); jdata->map = NULL; } /* cleanup the job info */ opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL); OBJ_RELEASE(jdata); } } cleanup: OBJ_RELEASE(caddy); }
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } pdata->iof_complete = true; if (pdata->waitpid_recvd) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; pdata->waitpid_recvd = true; if (pdata->iof_complete) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ pdata->alive = false; pdata->state = state; if (pdata->local_proc) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. */ orte_session_dir_finalize(proc); } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
/*
 * Event callback fired when one of a local child's IOF pipes (stdout,
 * stderr, or stddiag) becomes readable on the daemon.
 *
 * Reads up to one fragment, optionally mirrors it to a local file sink,
 * and forwards it to the HNP over the management conduit. On EOF or a
 * hard read error, releases the corresponding read event; when all three
 * channels for the proc are closed, declares the proc's IOF complete.
 *
 * NOTE(review): the "all done" checks below rely on OBJ_RELEASE NULL-ing
 * proct->revstdout/revstderr/revstddiag once the refcount hits zero -
 * confirm against the OPAL object macros.
 */
void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf=NULL;
    int rc;
    int32_t numbytes;
    orte_iof_proc_t *proct = (orte_iof_proc_t*)rev->proc;

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        /* Windows has no read() on pipe handles - go through the OS handle */
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    if (NULL == proct) {
        /* nothing we can do */
        ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN);
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         numbytes, ORTE_NAME_PRINT(&proct->name), fd));

    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry */
                opal_event_add(rev->ev, 0);
                return;
            }

            OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                 "%s iof:orted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&proct->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != rev->sink) {
        /* output to the corresponding file */
        orte_iof_base_write_output(&proct->name, rev->tag, data, numbytes, rev->sink->wev);
    }

    /* if this proc's output is not being copied to the HNP, we are done -
     * just rearm the read event and wait for more data */
    if (!proct->copy) {
        /* re-add the event */
        opal_event_add(rev->ev, 0);
        return;
    }

    /* prep the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* pack the stream first - we do this so that flow control messages can
     * consist solely of the tag
     */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack name of process that gave us this data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &proct->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack the data - only pack the #bytes we read! */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* start non-blocking RML call to forward received data */
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler sending %d bytes to HNP",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));

    orte_rml.send_buffer_nb(orte_mgmt_conduit,
                            ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                            send_cb, NULL);

    /* re-add the event */
    opal_event_add(rev->ev, 0);

    return;

 CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, release the
     * corresponding event. This deletes the read event and closes
     * the file descriptor */
    if (rev->tag & ORTE_IOF_STDOUT) {
        if( NULL != proct->revstdout ) {
            /* flush anything still buffered before tearing down */
            orte_iof_base_static_dump_output(proct->revstdout);
            OBJ_RELEASE(proct->revstdout);
        }
    } else if (rev->tag & ORTE_IOF_STDERR) {
        if( NULL != proct->revstderr ) {
            orte_iof_base_static_dump_output(proct->revstderr);
            OBJ_RELEASE(proct->revstderr);
        }
    } else if (rev->tag & ORTE_IOF_STDDIAG) {
        if( NULL != proct->revstddiag ) {
            orte_iof_base_static_dump_output(proct->revstddiag);
            OBJ_RELEASE(proct->revstddiag);
        }
    }
    /* check to see if they are all done */
    if (NULL == proct->revstdout &&
        NULL == proct->revstderr &&
        NULL == proct->revstddiag) {
        /* this proc's iof is complete */
        opal_list_remove_item(&mca_iof_orted_component.procs, &proct->super);
        ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
        OBJ_RELEASE(proct);
    }
    if (NULL != buf) {
        OBJ_RELEASE(buf);
    }
    return;
}
/*
 * Map-reduce orted IOF read handler.
 *
 * Reads a fragment from a local child's IOF pipe. Output may be
 * (a) mirrored to a per-rank file sink, (b) chained as stdin to the
 * next job in a map-reduce pipeline (via the ORTE_JOB_STDOUT_TARGET
 * attribute), or (c) forwarded to the HNP when this job is the end of
 * the chain. EOF/error tears down the matching read event and, once
 * all three channels are closed, marks the proc's IOF complete.
 *
 * NOTE(review): jdata from orte_get_job_data_object() is dereferenced
 * below without a NULL check - confirm a job object always exists for
 * rev->name.jobid when output arrives.
 */
void orte_iof_mrorted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf=NULL;
    int rc;
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    orte_ns_cmp_bitmask_t mask;
    orte_job_t *jdata;
    orte_job_map_t *map;
    int i;
    bool write_out=false;
    orte_node_t *node;
    orte_proc_t *daemon;
    orte_jobid_t stdout_target, *jbptr;

    /* read up to the fragment size */
    numbytes = read(fd, data, sizeof(data));

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:mrorted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         numbytes, ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry */
                opal_event_add(rev->ev, 0);
                return;
            }

            OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                 "%s iof:mrorted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&rev->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_mr_orted_component.sinks);
             item != opal_list_get_end(&mca_iof_mr_orted_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            mask = ORTE_NS_CMP_ALL;
            /* is this the desired proc? */
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
    }

    if (ORTE_IOF_STDOUT & rev->tag) {
        /* see if we need to forward this output - i.e. whether the job's
         * stdout is chained as stdin to a follow-on job */
        stdout_target = ORTE_JOBID_INVALID;
        jbptr = &stdout_target;
        jdata = orte_get_job_data_object(rev->name.jobid);
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_STDOUT_TARGET, (void**)&jbptr, ORTE_JOBID)) {
            /* end of the chain - just output the info */
            write_out = true;
            goto PROCESS;
        }
        /* it goes to the next job in the chain */
        jdata = orte_get_job_data_object(stdout_target);
        map = jdata->map;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            daemon = node->daemon;
            if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* if it is me, then send the bytes down the stdin pipe
                 * for every local proc (they are all on my proct list)
                 */
                for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
                     item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
                     item = opal_list_get_next(item)) {
                    proct = (orte_iof_proc_t*)item;
                    if (proct->name.jobid == jdata->jobid) {
                        if (NULL == proct->sink) {
                            opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                            continue;
                        }
                        orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                     "%s sending %d bytes from stdout of %s to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name),
                                     ORTE_NAME_PRINT(&daemon->name)));

                /* send the data to the daemon so it can
                 * write it to all local procs from this job
                 */
                send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
            }
        }
    }

 PROCESS:
    if (write_out) {
        /* prep the buffer */
        buf = OBJ_NEW(opal_buffer_t);

        /* pack the stream first - we do this so that flow control messages can
         * consist solely of the tag
         */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }

        /* pack name of process that gave us this data */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }

        /* pack the data - only pack the #bytes we read! */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
            ORTE_ERROR_LOG(rc);
            goto CLEAN_RETURN;
        }

        /* start non-blocking RML call to forward received data */
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                             "%s iof:mrorted:read handler sending %d bytes to HNP",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));

        /* NOTE(review): this call site omits the conduit argument used by
         * other send_buffer_nb callers in this tree - presumably an older
         * RML API generation; verify against the RML interface in use */
        orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                                orte_rml_send_callback, NULL);
    }

    /* re-add the event */
    opal_event_add(rev->ev, 0);

    return;

 CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, find this proc
     * on our list and clean up */
    for (item = opal_list_get_first(&mca_iof_mr_orted_component.procs);
         item != opal_list_get_end(&mca_iof_mr_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
            /* found it - release corresponding event. This deletes
             * the read event and closes the file descriptor */
            if (rev->tag & ORTE_IOF_STDOUT) {
                if( NULL != proct->revstdout ) {
                    OBJ_RELEASE(proct->revstdout);
                }
            } else if (rev->tag & ORTE_IOF_STDERR) {
                if( NULL != proct->revstderr ) {
                    OBJ_RELEASE(proct->revstderr);
                }
            } else if (rev->tag & ORTE_IOF_STDDIAG) {
                if( NULL != proct->revstddiag ) {
                    OBJ_RELEASE(proct->revstddiag);
                }
            }
            /* check to see if they are all done */
            if (NULL == proct->revstdout &&
                NULL == proct->revstderr &&
                NULL == proct->revstddiag) {
                /* this proc's iof is complete */
                opal_list_remove_item(&mca_iof_mr_orted_component.procs, item);
                ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                OBJ_RELEASE(proct);
            }
            break;
        }
    }
    if (NULL != buf) {
        OBJ_RELEASE(buf);
    }
    return;
}
/*
 * Periodic resource-usage sampler for the resusage sensor.
 *
 * Queries process stats for ourself plus the node, and for every live
 * local child, packs them into a buffer tagged "resusage", and transfers
 * the buffer into the sampler's bucket for transmission.
 *
 * Fixes relative to the previous version:
 *  - OBJ_DESTRUCT(stats) on a heap object created with OBJ_NEW replaced
 *    by OBJ_RELEASE (OBJ_DESTRUCT never frees the allocation and
 *    mismatches the object lifecycle)
 *  - comp no longer leaks when the first pack fails
 *  - stats/nstats are released on every error path and after packing
 *    (opal_dss.pack copies the data into the buffer)
 */
static void sample(orcm_sensor_sampler_t *sampler)
{
    opal_pstats_t *stats;
    opal_node_stats_t *nstats;
    int rc, i;
    orte_proc_t *child;
    opal_buffer_t buf, *bptr;
    char *comp;

    OPAL_OUTPUT_VERBOSE((1, orcm_sensor_base_framework.framework_output,
                         "sample:resusage sampling resource usage"));

    /* setup a buffer for our stats */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* pack our name */
    comp = strdup("resusage");
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        free(comp);  /* don't leak the component name on error */
        OBJ_DESTRUCT(&buf);
        return;
    }
    free(comp);

    /* update stats on ourself and the node */
    stats = OBJ_NEW(opal_pstats_t);
    nstats = OBJ_NEW(opal_node_stats_t);
    if (ORCM_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
        ORTE_ERROR_LOG(rc);
        /* heap objects from OBJ_NEW must be released, not destructed */
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* the stats framework can't know nodename or rank */
    strncpy(stats->node, orte_process_info.nodename, (OPAL_PSTAT_MAX_STRING_LEN - 1));
    stats->rank = ORTE_PROC_MY_NAME->vpid;
#if 0
    /* locally save the stats */
    if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
        OBJ_RELEASE(st);
    }
    if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
        /* release the popped value */
        OBJ_RELEASE(nst);
    }
#endif

    /* pack them */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(stats);
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }
    /* the pack copied the data, so release our copies now
     * (the disabled ring-buffer store above would have taken
     * ownership instead - remove these releases if re-enabled) */
    OBJ_RELEASE(stats);
    OBJ_RELEASE(nstats);

    /* loop through our children and update their stats */
    if (NULL != orte_local_children) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            stats = OBJ_NEW(opal_pstats_t);
            if (ORCM_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
                /* may hit a race condition where the process has
                 * terminated, so just ignore any error
                 */
                OBJ_RELEASE(stats);
                continue;
            }
            /* the stats framework can't know nodename or rank */
            strncpy(stats->node, orte_process_info.nodename, (OPAL_PSTAT_MAX_STRING_LEN - 1));
            stats->rank = child->name.vpid;
#if 0
            /* store it */
            if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
                OBJ_RELEASE(st);
            }
#endif
            /* pack them */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(stats);
                OBJ_DESTRUCT(&buf);
                return;
            }
            /* packed by value - release our copy */
            OBJ_RELEASE(stats);
        }
    }

    /* xfer any data for transmission */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(&sampler->bucket, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
    }
    OBJ_DESTRUCT(&buf);

#if 0
    /* are there any issues with node-level usage? */
    nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
    if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s CHECKING NODE MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* compute the percentage of node memory in-use */
        in_use = 1.0 - (nst->free_mem / nst->total_mem);
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s PERCENT USED: %f LIMIT: %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             in_use, mca_sensor_resusage_component.node_memory_limit));
        if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
            /* loop through our children and find the biggest hog */
            hog = NULL;
            max_mem = 0.0;
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (!ORTE_FLAG_TEST(child, ORTE_PROC_IS_ALIVE)) {
                    continue;
                }
                if (0 == child->pid) {
                    /* race condition */
                    continue;
                }
                if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((5, orcm_sensor_base_framework.framework_output,
                                     "%s PROC %s AT VSIZE %f",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name), st->vsize));
                if (max_mem < st->vsize) {
                    hog = child;
                    max_mem = st->vsize;
                }
            }
            if (NULL == hog) {
                /* if all children dead and we are still too big,
                 * then we must be the culprit - abort
                 */
                OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                                     "%s NO CHILD: COMMITTING SUICIDE",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                orte_errmgr.abort(ORCM_ERR_MEM_LIMIT_EXCEEDED, NULL);
            } else {
                /* report the problem */
                OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                                     "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&hog->name)));
                ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
            /* since we have ordered someone to die, we've done enough for this
             * time around - don't check proc limits as well */
            return;
        }
    }

    /* check proc limits */
    if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s CHECKING PROC MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* check my children first */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_IS_ALIVE)) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((5, orcm_sensor_base_framework.framework_output,
                                 "%s PROC %s AT VSIZE %f",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name), st->vsize));
            if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
                /* report the problem */
                ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
        }
    }
#endif
}
static void proc_errors(int fd, short args, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_job_t *jdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_proc_t *child, *ptr; opal_buffer_t *alert; orte_plm_cmd_flag_t cmd; int rc=ORTE_SUCCESS; int i; OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors process %s error state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* * if orte is trying to shutdown, just let it */ if (orte_finalizing) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors finalizing - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* if this was a failed comm, then see if it was to our * lifeline */ if (ORTE_PROC_STATE_LIFELINE_LOST == state || ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:orted lifeline lost - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* set our exit status */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* get the job object */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { /* must already be complete */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } if (ORTE_PROC_STATE_COMM_FAILED == state) { /* if it is our own connection, ignore it */ if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } /* was it a daemon? */ if (proc->jobid != ORTE_PROC_MY_NAME->jobid) { /* nope - we can't seem to trust that we will catch the waitpid * in this situation, so push this over to be handled as if * it were a waitpid trigger so we don't create a bunch of * duplicate code */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* leave the exit code alone - process this as a waitpid */ odls_base_default_wait_local_proc(child, NULL); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted daemon %s exited", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); /* if we are using static ports, then it is possible that the HNP * will not see this termination. 
So if the HNP didn't order us * to terminate, then we should ensure it knows */ if (orte_static_ports && !orte_orteds_term_ordered) { /* send an alert to the HNP */ alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* get the proc_t */ if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* set the exit code to reflect the problem */ child->exit_code = ORTE_ERR_COMM_FAILURE; /* pack only the data for this daemon - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the daemon's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting lost connection to daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); /* continue on */ goto cleanup; } if (orte_orteds_term_ordered) { /* are any of my children still alive */ for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s errmgr:default:orted[%s(%d)] proc %s is alive", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, ORTE_NAME_PRINT(&child->name))); goto cleanup; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } else { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted not exiting, num_routes() == %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)orte_routed.num_routes())); } } /* if not, then we can continue */ goto cleanup; } if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE); goto cleanup; } /* if this is not a local proc for this job, we can * ignore this call */ if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted:proc_errors proc is not local - ignoring error", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto cleanup; } OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted got state %s for proc %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_proc_state_to_str(state), ORTE_NAME_PRINT(proc))); if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { /* update the state */ child->state = state; /* report this as abnormal termination to the HNP, unless we already have * done so for this job */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc 
- have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(alert); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } if (ORTE_PROC_STATE_FAILED_TO_START == state || ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) { /* update the proc state */ child->state = state; /* count the proc as having "terminated" */ jdata->num_terminated++; /* leave the error report in this case to the * state machine, which will receive notice * when all local procs have attempted to start * so that we send a consolidated error report * back to the HNP */ goto cleanup; } if (ORTE_PROC_STATE_TERMINATED < state) { /* if we were ordered to terminate, see if * any of our routes or local children remain alive - if not, then * terminate ourselves. */ if (orte_orteds_term_ordered) { /* mark the child as no longer alive and update the counters, if necessary. 
* we have to do this here as we aren't going to send this to the state * machine, and we want to keep the bookkeeping accurate just in case */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE); } if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED); jdata->num_terminated++; } for (i=0; i < orte_local_children->size; i++) { if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) { goto keep_going; } } } /* if all my routes and children are gone, then terminate ourselves nicely (i.e., this is a normal termination) */ if (0 == orte_routed.num_routes()) { OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output, "%s errmgr:default:orted all routes gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); } /* no need to alert the HNP - we are already on our way out */ goto cleanup; } keep_going: /* if the job hasn't completed and the state is abnormally * terminated, then we need to alert the HNP right away - but * only do this once! 
*/ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack only the data for this proc - have to start with the jobid * so the receiver can unpack it correctly */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } child->state = state; /* now pack the child's info */ if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&child->name), jdata->num_local_procs)); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we notified the HNP for this job so we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } /* if the proc has terminated, notify the state machine */ if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) && ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } goto cleanup; } /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { alert = OBJ_NEW(opal_buffer_t); /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); return; } /* pack the data for the job */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); return; } OPAL_OUTPUT_VERBOSE((5, 
orte_errmgr_base_framework.framework_output, "%s errmgr:default_orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); /* remove all of this job's children from the global list */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { continue; } if (jdata->jobid == ptr->name.jobid) { opal_pointer_array_set_item(orte_local_children, i, NULL); OBJ_RELEASE(ptr); } } /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jdata->jobid); /* remove this job from our local job data since it is complete */ opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL); OBJ_RELEASE(jdata); /* send it */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } return; } cleanup: OBJ_RELEASE(caddy); }
/* this is the read handler for my own child procs. In this case,
 * the data is going nowhere - I just output it myself
 *
 * Libevent read callback: fd is the pipe/tty being read, event is the
 * libevent flags (unused), cbdata is the orte_iof_read_event_t for the
 * stream.  Holds the component lock for the whole call; every exit path
 * below unlocks before returning.  A read of 0 bytes (or an unrecoverable
 * error, which is coerced to 0) signals end-of-stream and tears down the
 * corresponding read event.
 */
void orte_iof_hnp_read_local_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;
    opal_list_item_t *item, *prev_item;
    orte_iof_proc_t *proct;
    int rc;
    orte_ns_cmp_bitmask_t mask;

    OPAL_THREAD_LOCK(&mca_iof_hnp_component.lock);

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        /* Windows has no read() on pipe handles - go through the Win32 API */
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    if (numbytes < 0) {
        /* either we have a connection error or it was a non-blocking read */

        /* non-blocking, retry: re-arm the event and wait for more data */
        if (EAGAIN == errno || EINTR == errno) {
            opal_event_add(rev->ev, 0);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s iof:hnp:read handler %s Error on connection:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&rev->name), fd));
        /* Un-recoverable error. Allow the code to flow as usual in order to
         * to send the zero bytes message up the stream, and then close the
         * file descriptor and delete the event.
         */
        numbytes = 0;
    }

    /* is this read from our stdin? */
    if (ORTE_IOF_STDIN & rev->tag) {
        /* The event has fired, so it's no longer active until we re-add it */
        mca_iof_hnp_component.stdinev->active = false;

        /* if job termination has been ordered, just ignore the
         * data and delete the read event */
        if (orte_job_term_ordered) {
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
            OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
            return;
        }

        /* cycle through our list of sinks and forward the stdin data to
         * every sink that wants stdin, local or remote */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t* sink = (orte_iof_sink_t*)item;

            /* only look at stdin sinks */
            if (!(ORTE_IOF_STDIN & sink->tag)) {
                continue;
            }

            mask = ORTE_NS_CMP_ALL;

            /* if the daemon is me, then this is a local sink */
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, &sink->daemon)) {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s read %d bytes from stdin - writing to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     numbytes, ORTE_NAME_PRINT(&rev->name)));
                /* send the bytes down the pipe - we even send 0 byte events
                 * down the pipe so it forces out any preceding data before
                 * closing the output stream
                 */
                if (NULL != sink->wev) {
                    if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev)) {
                        /* getting too backed up - stop the read event for now if it is still active */
                        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                             "buffer backed up - holding"));
                        /* NOTE: the read event is NOT re-added here; the
                         * writer drain logic is expected to restart stdin */
                        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
                        return;
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s sending %d bytes from stdin to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&sink->daemon)));

                /* send the data to the daemon so it can
                 * write it to the proc's fd - in this case,
                 * we pass sink->name to indicate who is to
                 * receive the data. If the connection closed,
                 * numbytes will be zero so zero bytes will be
                 * sent - this will tell the daemon to close
                 * the fd for stdin to that proc
                 */
                if( ORTE_SUCCESS != (rc = orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &sink->name, ORTE_IOF_STDIN, data, numbytes))) {
                    /* if the addressee is unknown, remove the sink from the list
                     * (step back to the previous item so the loop's get_next
                     * remains valid after the removal) */
                    if( ORTE_ERR_ADDRESSEE_UNKNOWN == rc ) {
                        prev_item = opal_list_get_prev(item);
                        opal_list_remove_item(&mca_iof_hnp_component.sinks, item);
                        OBJ_RELEASE(item);
                        item = prev_item;
                    }
                }
            }
        }

        /* if num_bytes was zero, or we read the last piece of the file, then we need to terminate the event */
        if (0 == numbytes) {
            /* this will also close our stdin file descriptor */
            OBJ_RELEASE(mca_iof_hnp_component.stdinev);
        } else {
            /* if we are looking at a tty, then we just go ahead and restart the
             * read event assuming we are not backgrounded
             */
            if (orte_iof_hnp_stdin_check(fd)) {
                restart_stdin(fd, 0, NULL);
            } else {
                /* delay for awhile and then restart */
                ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI);
            }
        }
        /* nothing more to do */
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* this must be output from one of my local procs - see
     * if anyone else has requested a copy of this info
     */
    for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
         item != opal_list_get_end(&mca_iof_hnp_component.sinks);
         item = opal_list_get_next(item)) {
        orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
        /* if the target isn't set, then this sink is for another purpose - ignore it */
        if (ORTE_JOBID_INVALID == sink->daemon.jobid) {
            continue;
        }
        /* forward to any sink registered for this stream whose name matches
         * the source proc (vpid may be a wildcard) */
        if ((sink->tag & rev->tag) &&
            sink->name.jobid == rev->name.jobid &&
            (ORTE_VPID_WILDCARD == sink->name.vpid || sink->name.vpid == rev->name.vpid)) {
            /* need to send the data to the remote endpoint - if
             * the connection closed, numbytes will be zero, so
             * the remote endpoint will know to close its local fd.
             * In this case, we pass rev->name to indicate who the
             * data came from.
             */
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s sending data to tool %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&sink->daemon)));
            orte_iof_hnp_send_data_to_endpoint(&sink->daemon, &rev->name, rev->tag, data, numbytes);
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s read %d bytes from %s of %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
                         ORTE_NAME_PRINT(&rev->name)));

    if (0 == numbytes) {
        /* if we read 0 bytes from the stdout/err/diag, there is
         * nothing to output - find this proc on our list and
         * release the appropriate event. This will delete the
         * read event and close the file descriptor */
        for (item = opal_list_get_first(&mca_iof_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
                /* found it - release corresponding event. This deletes
                 * the read event and closes the file descriptor
                 * (assumes OBJ_RELEASE also NULLs the rev pointers - the
                 * all-done check below relies on that) */
                if (rev->tag & ORTE_IOF_STDOUT) {
                    OBJ_RELEASE(proct->revstdout);
                } else if (rev->tag & ORTE_IOF_STDERR) {
                    OBJ_RELEASE(proct->revstderr);
                } else if (rev->tag & ORTE_IOF_STDDIAG) {
                    OBJ_RELEASE(proct->revstddiag);
                }
                /* check to see if they are all done */
                if (NULL == proct->revstdout &&
                    NULL == proct->revstderr &&
                    NULL == proct->revstddiag) {
                    /* this proc's iof is complete - tell the state machine */
                    opal_list_remove_item(&mca_iof_hnp_component.procs, item);
                    ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                    OBJ_RELEASE(proct);
                }
                break;
            }
        }
        /* do NOT re-add the read event - the stream is closed */
        OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
        return;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_hnp_component.sinks);
             item != opal_list_get_end(&mca_iof_hnp_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            /* is this the desired proc? */
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
    } else {
        /* output this to our local output */
        if (ORTE_IOF_STDOUT & rev->tag || orte_xml_output) {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
        } else {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
        }
    }

    /* re-add the event so we keep reading this stream */
    opal_event_add(rev->ev, 0);

    OPAL_THREAD_UNLOCK(&mca_iof_hnp_component.lock);
    return;
}
/* process incoming messages in order of receipt */
/*
 * RML receive callback for the PLM command tag.  Dispatches on the packed
 * command: launch-job requests (from comm_spawn callers), proc-state
 * updates from daemons, and registration notifications.  sender identifies
 * the originating process; buffer holds the packed payload; cbdata unused.
 *
 * Fix vs. prior version: in the ORTE_PLM_UPDATE_PROC_STATE loop, a failed
 * proc lookup logged the error and forced termination but then fell
 * through and dereferenced the NULL proc pointer - it now bails out to
 * CLEANUP instead.
 */
void orte_plm_base_recv(int status, orte_process_name_t* sender,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_jobid_t job;
    orte_job_t *jdata, *parent;
    opal_buffer_t *answer;
    orte_vpid_t vpid;
    orte_proc_t *proc;
    orte_proc_state_t state;
    orte_exit_code_t exit_code;
    int32_t rc=ORTE_SUCCESS, ret;
    orte_app_context_t *app, *child_app;
    orte_process_name_t name;
    pid_t pid;
    bool running;
    int i;
    char **env;
    char *prefix_dir;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive processing msg",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    switch (command) {
    case ORTE_PLM_LAUNCH_JOB_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive job launch command from %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(sender)));

        /* unpack the job object */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &count, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        /* record the sender so we know who to respond to */
        jdata->originator.jobid = sender->jobid;
        jdata->originator.vpid = sender->vpid;

        /* get the parent's job object */
        if (NULL != (parent = orte_get_job_data_object(sender->jobid))) {
            /* if the prefix was set in the parent's job, we need to transfer
             * that prefix to the child's app_context so any further launch of
             * orteds can find the correct binary. There always has to be at
             * least one app_context in both parent and child, so we don't
             * need to check that here. However, be sure not to overwrite
             * the prefix if the user already provided it!
             */
            app = (orte_app_context_t*)opal_pointer_array_get_item(parent->apps, 0);
            child_app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0);
            prefix_dir = NULL;
            if (orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING) &&
                !orte_get_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, NULL, OPAL_STRING)) {
                orte_set_attribute(&child_app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, prefix_dir, OPAL_STRING);
            }
            if (NULL != prefix_dir) {
                free(prefix_dir);
            }
        }

        /* if the user asked to forward any envars, cycle through the app contexts
         * in the comm_spawn request and add them
         */
        if (NULL != orte_forwarded_envars) {
            for (i=0; i < jdata->apps->size; i++) {
                if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
                    continue;
                }
                env = opal_environ_merge(orte_forwarded_envars, app->env);
                opal_argv_free(app->env);
                app->env = env;
            }
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive adding hosts",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /* process any add-hostfile and add-host options that were provided */
        if (ORTE_SUCCESS != (rc = orte_ras_base_add_hosts(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }

        if (NULL != parent) {
            if (NULL == parent->bookmark) {
                /* find the sender's node in the job map */
                if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(parent->procs, sender->vpid))) {
                    /* set the bookmark so the child starts from that place - this means
                     * that the first child process could be co-located with the proc
                     * that called comm_spawn, assuming slots remain on that node. Otherwise,
                     * the procs will start on the next available node
                     */
                    jdata->bookmark = proc->node;
                }
            } else {
                jdata->bookmark = parent->bookmark;
            }
        }

        /* launch it */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive calling spawn",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
            ORTE_ERROR_LOG(rc);
            goto ANSWER_LAUNCH;
        }
        break;

    ANSWER_LAUNCH:
        /* launch failed somewhere above - return the error code to the
         * requester so its comm_spawn can fail cleanly */
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:receive - error on launch: %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        /* setup the response */
        answer = OBJ_NEW(opal_buffer_t);

        /* pack the error code to be returned */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT32))) {
            ORTE_ERROR_LOG(ret);
        }

        /* send the response back to the sender */
        if (0 > (ret = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_PLM_PROXY,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(answer);
        }
        break;

    case ORTE_PLM_UPDATE_PROC_STATE:
        opal_output_verbose(5, orte_plm_base_framework.framework_output,
                            "%s plm:base:receive update proc state command from %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(sender));
        count = 1;
        /* the buffer holds one or more job sections, each a jobid followed
         * by (vpid, pid, state, exit_code) tuples terminated by an invalid
         * vpid - loop until the unpack runs past the end of the buffer */
        while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            opal_output_verbose(5, orte_plm_base_framework.framework_output,
                                "%s plm:base:receive got update_proc_state for job %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_JOBID_PRINT(job));

            name.jobid = job;
            running = false;
            /* get the job object */
            jdata = orte_get_job_data_object(job);
            count = 1;
            while (ORTE_SUCCESS == (rc = opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID))) {
                if (ORTE_VPID_INVALID == vpid) {
                    /* flag indicates that this job is complete - move on */
                    break;
                }
                name.vpid = vpid;
                /* unpack the pid */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &pid, &count, OPAL_PID))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                /* unpack the state */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &state, &count, ORTE_PROC_STATE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }
                if (ORTE_PROC_STATE_RUNNING == state) {
                    running = true;
                }
                /* unpack the exit code */
                count = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &exit_code, &count, ORTE_EXIT_CODE))) {
                    ORTE_ERROR_LOG(rc);
                    goto CLEANUP;
                }

                OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                                     "%s plm:base:receive got update_proc_state for vpid %lu state %s exit_code %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (unsigned long)vpid, orte_proc_state_to_str(state), (int)exit_code));

                if (NULL != jdata) {
                    /* get the proc data object */
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, vpid))) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        /* bail - dereferencing the NULL proc below would crash */
                        rc = ORTE_ERR_NOT_FOUND;
                        goto CLEANUP;
                    }
                    /* NEVER update the proc state before activating the state machine - let
                     * the state cbfunc update it as it may need to compare this
                     * state against the prior proc state */
                    proc->pid = pid;
                    proc->exit_code = exit_code;
                    ORTE_ACTIVATE_PROC_STATE(&name, state);
                }
            }
            /* record that we heard back from a daemon during app launch */
            if (running && NULL != jdata) {
                jdata->num_daemons_reported++;
                if (orte_report_launch_progress) {
                    if (0 == jdata->num_daemons_reported % 100 ||
                        jdata->num_daemons_reported == orte_process_info.num_procs) {
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REPORT_PROGRESS);
                    }
                }
            }
            /* prepare for next job */
            count = 1;
        }
        /* running off the end of the buffer is the normal loop exit */
        if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            ORTE_ERROR_LOG(rc);
        } else {
            rc = ORTE_SUCCESS;
        }
        break;

    case ORTE_PLM_REGISTERED_CMD:
        count=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &job, &count, ORTE_JOBID))) {
            ORTE_ERROR_LOG(rc);
            goto DEPART;
        }
        name.jobid = job;
        /* get the job object */
        if (NULL == (jdata = orte_get_job_data_object(job))) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto DEPART;
        }
        count=1;
        /* activate REGISTERED for each vpid packed in the message */
        while (ORTE_SUCCESS == opal_dss.unpack(buffer, &vpid, &count, ORTE_VPID)) {
            name.vpid = vpid;
            ORTE_ACTIVATE_PROC_STATE(&name, ORTE_PROC_STATE_REGISTERED);
            count=1;
        }
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        break;
    }

 CLEANUP:
    if (ORTE_SUCCESS != rc) {
        goto DEPART;
    }

 DEPART:
    /* see if an error occurred - if so, wakeup the HNP so we can exit */
    if (ORTE_PROC_IS_HNP && ORTE_SUCCESS != rc) {
        jdata = NULL;
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:receive done processing commands",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; int8_t flag; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { 
continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } } cleanup: OBJ_RELEASE(caddy); }
void mca_oob_tcp_component_hop_unknown(int fd, short args, void *cbdata) { mca_oob_tcp_msg_error_t *mop = (mca_oob_tcp_msg_error_t*)cbdata; uint64_t ui64; orte_rml_send_t *snd; orte_oob_base_peer_t *bpr; opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s tcp:unknown hop called for peer %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->hop)); if (orte_finalizing || orte_abnormal_term_ordered) { /* just ignore the problem */ OBJ_RELEASE(mop); return; } /* mark that this component cannot reach this hop */ memcpy(&ui64, (char*)&(mop->hop), sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, ui64, (void**)&bpr) || NULL == bpr) { /* the overall OOB has no knowledge of this hop. Only * way this could happen is if the peer contacted us * via this component, and it wasn't entered into the * OOB framework hash table. We have no way of knowing * what to do next, so just output an error message and * abort */ opal_output(0, "%s ERROR: message to %s requires routing and the OOB has no knowledge of the reqd hop %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->snd->hdr.dst), ORTE_NAME_PRINT(&mop->hop)); ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); OBJ_RELEASE(mop); return; } opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx); /* mark that this component cannot reach this destination either */ memcpy(&ui64, (char*)&(mop->snd->hdr.dst), sizeof(uint64_t)); if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&orte_oob_base.peers, ui64, (void**)&bpr) || NULL == bpr) { opal_output(0, "%s ERROR: message to %s requires routing and the OOB has no knowledge of this process", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&mop->snd->hdr.dst)); ORTE_ACTIVATE_PROC_STATE(&mop->hop, ORTE_PROC_STATE_COMM_FAILED); OBJ_RELEASE(mop); return; } opal_bitmap_clear_bit(&bpr->addressable, mca_oob_tcp_component.super.idx); /* post the message to the 
OOB so it can see * if another component can transfer it */ MCA_OOB_TCP_HDR_NTOH(&mop->snd->hdr); snd = OBJ_NEW(orte_rml_send_t); snd->dst = mop->snd->hdr.dst; snd->origin = mop->snd->hdr.origin; snd->tag = mop->snd->hdr.tag; snd->data = mop->snd->data; snd->count = mop->snd->hdr.nbytes; snd->cbfunc.iov = NULL; snd->cbdata = NULL; /* activate the OOB send state */ ORTE_OOB_SEND(snd); /* protect the data */ mop->snd->data = NULL; OBJ_RELEASE(mop); }
/*
 * Read-event handler for a local child's stdout/stderr/stddiag pipe on an
 * orted (daemon). Reads one fragment, then either writes it to a local
 * file sink (when --output-filename was given) or packs it into a buffer
 * and forwards it to the HNP over the RML. A read of <= 0 bytes (other
 * than EAGAIN/EINTR) means the channel closed: the matching read event is
 * released and, when all three streams are done, the proc is marked
 * IOF-complete.
 */
void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf=NULL;
    int rc;
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    orte_ns_cmp_bitmask_t mask;

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        /* NOTE(review): ReadFile's return value is not checked here, so a
         * Windows read error is indistinguishable from EOF - confirm this
         * is acceptable for this path */
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         numbytes, ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry - re-arm the event and wait for more data */
                opal_event_add(rev->ev, 0);
                return;
            }
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                 "%s iof:orted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&rev->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_orted_component.sinks);
             item != opal_list_get_end(&mca_iof_orted_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            mask = ORTE_NS_CMP_ALL;
            /* is this the desired proc? */
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
        /* file output does not go to the HNP - just re-arm and return */
        goto RESTART;
    }

    /* prep the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* pack the stream first - we do this so that flow control messages can
     * consist solely of the tag */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack name of process that gave us this data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack the data - only pack the #bytes we read! */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* start non-blocking RML call to forward received data.
     * NOTE(review): buf ownership passes to the RML here; presumably
     * send_cb releases it on completion - confirm against send_cb */
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler sending %d bytes to HNP",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));

    orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                            send_cb, NULL);

 RESTART:
    /* re-add the event */
    opal_event_add(rev->ev, 0);
    return;

 CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, find this proc
     * on our list and clean up */
    for (item = opal_list_get_first(&mca_iof_orted_component.procs);
         item != opal_list_get_end(&mca_iof_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
            /* found it - release corresponding event. This deletes
             * the read event and closes the file descriptor.
             * NOTE(review): the all-done check below assumes OBJ_RELEASE
             * NULLs the released pointer (standard OPAL macro behavior) -
             * confirm against opal_object.h */
            if (rev->tag & ORTE_IOF_STDOUT) {
                if( NULL != proct->revstdout ) {
                    OBJ_RELEASE(proct->revstdout);
                }
            } else if (rev->tag & ORTE_IOF_STDERR) {
                if( NULL != proct->revstderr ) {
                    OBJ_RELEASE(proct->revstderr);
                }
            } else if (rev->tag & ORTE_IOF_STDDIAG) {
                if( NULL != proct->revstddiag ) {
                    OBJ_RELEASE(proct->revstddiag);
                }
            }
            /* check to see if they are all done */
            if (NULL == proct->revstdout &&
                NULL == proct->revstderr &&
                NULL == proct->revstddiag) {
                /* this proc's iof is complete */
                opal_list_remove_item(&mca_iof_orted_component.procs, item);
                ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                OBJ_RELEASE(proct);
            }
            break;
        }
    }
    /* release the partially-packed buffer, if we made one */
    if (NULL != buf) {
        OBJ_RELEASE(buf);
    }
    return;
}
/*
 * Map-reduce HNP read handler for my own child procs and my stdin.
 *
 * Behavior by stream:
 *  - STDIN: fan the bytes out to every daemon hosting procs of each job
 *    registered for stdin, writing directly for local procs and using
 *    send_data() for remote daemons. A short read (numbytes < fragment
 *    size) is treated as EOF, so a 0-byte message is pushed to flush and
 *    close each downstream pipe.
 *  - STDOUT: if the job has a stdout_target, forward the output as stdin
 *    to the next job in the chain; otherwise fall through and write it
 *    to our local stdout.
 *  - STDERR/STDDIAG: always written to our local stderr sink.
 * A 0-byte read releases the corresponding read event and, when all three
 * streams of a proc are done, activates IOF_COMPLETE for it.
 */
void orte_iof_mrhnp_read_local_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    int i, j;
    orte_ns_cmp_bitmask_t mask;
    orte_job_t *jdata;
    orte_iof_job_t *iofjob;
    orte_node_t *node;
    orte_proc_t *daemon;
    orte_job_map_t *map;
    bool write_out=false;

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        /* NOTE(review): ReadFile's return value is unchecked, so a Windows
         * read error looks like EOF - confirm this is intended */
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s iof:mrhnp:read handler read %d bytes from %s:%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes < 0) {
        /* either we have a connection error or it was a non-blocking read */
        /* non-blocking, retry */
        if (EAGAIN == errno || EINTR == errno) {
            opal_event_add(rev->ev, 0);
            return;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                             "%s iof:mrhnp:read handler %s Error on connection:%d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&rev->name), fd));
        /* Un-recoverable error. Allow the code to flow as usual in order to
         * to send the zero bytes message up the stream, and then close the
         * file descriptor and delete the event. */
        numbytes = 0;
    }

    /* if job termination has been ordered, just ignore the
     * data and delete the stdin read event, if that is what fired */
    if (orte_job_term_ordered) {
        if (ORTE_IOF_STDIN & rev->tag) {
            OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
        }
        return;
    }

    if (ORTE_IOF_STDIN & rev->tag) {
        /* The event has fired, so it's no longer active until we
         * re-add it */
        mca_iof_mr_hnp_component.stdinev->active = false;
        /* if this was read from my stdin, I need to send this input to all
         * daemons who host mapper procs */
        for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
            if (NULL == (iofjob = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
                continue;
            }
            jdata = iofjob->jdata;
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                 "%s read %d bytes from stdin - writing to job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            map = jdata->map;
            for (i=0; i < map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                    continue;
                }
                daemon = node->daemon;
                if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* if it is me, then send the bytes down the stdin pipe
                     * for every local proc (they are all on my proct list) - we even send 0 byte events
                     * down the pipe so it forces out any preceding data before
                     * closing the output stream. We add a 0 byte message if
                     * numbytes < sizeof(data) as this means the chunk we read
                     * was the end of the file. */
                    for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
                         item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
                         item = opal_list_get_next(item)) {
                        proct = (orte_iof_proc_t*)item;
                        if (proct->name.jobid == jdata->jobid) {
                            if (NULL == proct->sink) {
                                opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                                continue;
                            }
                            if (ORTE_IOF_MAX_INPUT_BUFFERS < orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev)) {
                                /* getting too backed up - stop the read event for now if it is still active.
                                 * NOTE(review): this returns mid-fan-out, so
                                 * later jobs/daemons never see this fragment -
                                 * presumably acceptable backpressure; confirm */
                                if (mca_iof_mr_hnp_component.stdinev->active) {
                                    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output, "buffer backed up - holding"));
                                    mca_iof_mr_hnp_component.stdinev->active = false;
                                }
                                return;
                            }
                            if (0 < numbytes && numbytes < (int)sizeof(data)) {
                                /* need to write a 0-byte event to clear the stream and close it */
                                orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, 0, proct->sink->wev);
                                proct->sink = NULL;
                            }
                        }
                    }
                } else {
                    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                         "%s sending %d bytes from stdin to daemon %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                         ORTE_NAME_PRINT(&daemon->name)));
                    /* send the data to the daemon so it can
                     * write it to all local procs from this job.
                     * If the connection closed,
                     * numbytes will be zero so zero bytes will be
                     * sent - this will tell the daemon to close
                     * the fd for stdin to that proc */
                    send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
                    if (0 < numbytes && numbytes < (int)sizeof(data)) {
                        /* need to send a 0-byte message to clear the stream and close it */
                        send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, 0);
                    }
                }
            }
        }
        /* if num_bytes was zero, then we need to terminate the event */
        if (0 == numbytes || numbytes < (int)sizeof(data)) {
            /* this will also close our stdin file descriptor */
            if (NULL != mca_iof_mr_hnp_component.stdinev) {
                OBJ_RELEASE(mca_iof_mr_hnp_component.stdinev);
            }
        } else {
            /* if we are looking at a tty, then we just go ahead and restart the
             * read event assuming we are not backgrounded */
            if (orte_iof_mrhnp_stdin_check(fd)) {
                restart_stdin(fd, 0, NULL);
            } else {
                /* delay for awhile and then restart */
                ORTE_TIMER_EVENT(0, 10000, restart_stdin, ORTE_INFO_PRI);
            }
        }
        return;
    }

    if (ORTE_IOF_STDOUT & rev->tag && 0 < numbytes) {
        /* see if we need to forward this output.
         * NOTE(review): orte_get_job_data_object results are dereferenced
         * without NULL checks below - presumably these jobids are always
         * registered by the time output arrives; confirm */
        jdata = orte_get_job_data_object(rev->name.jobid);
        if (ORTE_JOBID_INVALID == jdata->stdout_target) {
            /* end of the chain - just output the info */
            write_out = true;
            goto PROCESS;
        }
        /* it goes to the next job in the chain */
        jdata = orte_get_job_data_object(jdata->stdout_target);
        map = jdata->map;
        for (i=0; i < map->nodes->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                continue;
            }
            daemon = node->daemon;
            if (daemon->name.vpid == ORTE_PROC_MY_NAME->vpid) {
                /* if it is me, then send the bytes down the stdin pipe
                 * for every local proc (they are all on my proct list) */
                for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
                     item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
                     item = opal_list_get_next(item)) {
                    proct = (orte_iof_proc_t*)item;
                    if (proct->name.jobid == jdata->jobid) {
                        if (NULL == proct->sink) {
                            opal_output(0, "NULL SINK FOR PROC %s", ORTE_NAME_PRINT(&proct->name));
                            continue;
                        }
                        orte_iof_base_write_output(&proct->name, ORTE_IOF_STDIN, data, numbytes, proct->sink->wev);
                    }
                }
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                                     "%s sending %d bytes from stdout of %s to daemon %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                                     ORTE_NAME_PRINT(&rev->name), ORTE_NAME_PRINT(&daemon->name)));
                /* send the data to the daemon so it can
                 * write it to all local procs from this job */
                send_data(&daemon->name, ORTE_IOF_STDIN, jdata->jobid, data, numbytes);
            }
        }
    }
    /* stderr/stddiag, 0-byte stdout, and forwarded stdout all fall
     * through to here (write_out stays false unless set above) */

 PROCESS:
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base.iof_output,
                         "%s read %d bytes from %s of %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes,
                         (ORTE_IOF_STDOUT & rev->tag) ? "stdout" : ((ORTE_IOF_STDERR & rev->tag) ? "stderr" : "stddiag"),
                         ORTE_NAME_PRINT(&rev->name)));

    if (0 == numbytes) {
        /* if we read 0 bytes from the stdout/err/diag, find this proc
         * on our list and
         * release the appropriate event. This will delete the
         * read event and close the file descriptor */
        for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
                /* found it - release corresponding event. This deletes
                 * the read event and closes the file descriptor.
                 * NOTE(review): the all-done check assumes OBJ_RELEASE NULLs
                 * the released pointer (standard OPAL macro behavior) -
                 * confirm against opal_object.h */
                if (rev->tag & ORTE_IOF_STDOUT) {
                    OBJ_RELEASE(proct->revstdout);
                } else if (rev->tag & ORTE_IOF_STDERR) {
                    OBJ_RELEASE(proct->revstderr);
                } else if (rev->tag & ORTE_IOF_STDDIAG) {
                    OBJ_RELEASE(proct->revstddiag);
                }
                /* check to see if they are all done */
                if (NULL == proct->revstdout &&
                    NULL == proct->revstderr &&
                    NULL == proct->revstddiag) {
                    /* this proc's iof is complete */
                    opal_list_remove_item(&mca_iof_mr_hnp_component.procs, item);
                    ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                    OBJ_RELEASE(proct);
                }
                break;
            }
        }
        return;
    } else {
        /* output this to our local output */
        if (ORTE_IOF_STDOUT & rev->tag) {
            if (write_out) {
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stdout->wev);
            }
        } else {
            orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, orte_iof_base.iof_write_stderr->wev);
        }
    }

    /* re-add the event */
    opal_event_add(rev->ev, 0);
    return;
}
static void heartbeat_with_AM_cb(int fd, short event, void *data) { int i, rc; orte_job_t *jdata = (orte_job_t*)data; orte_job_t* daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); /* 1. create heartbeat request msg */ /* message HeartbeatRequestProto { } */ struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "HeartbeatRequestProto"); if (!request_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: failed to create request_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } /* 2. send heartbeat request msg */ rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_HEARTBEAT); if (rc != 0) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error happened when send request_msg to AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } /* 3. recv response and parse the msg*/ /* message HeartbeatResponseProto { repeated ProcessStatusProto completed_processes = 1; } message ProcessStatusProto { optional ProcessNameProto name = 1; optional ProcessStateProto state = 2; optional int32 exit_value = 3; } enum ProcessStateProto { RUNNING = 1; COMPLETED = 2; } message ProcessNameProto { optional int32 jobid = 1; optional int32 vpid = 2; } */ struct pbc_rmessage* response_msg = orte_hdclient_recv_message("HeartbeatResponseProto"); if (!response_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error happened when recv HeartbeatResponseProto msg from AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } int n = pbc_rmessage_size(response_msg, "completed_processes"); if (n < 0) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: got n(=%d) < 0, please check", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n); goto cleanup; } for (i = 0; i < n; i++) { struct pbc_rmessage* completed_procs_msg = pbc_rmessage_message(response_msg, "completed_processes", i); if (!completed_procs_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error when 
parse returned completed_procs_msg from AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(completed_procs_msg, "name", 0); if (!proc_name_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error when parse proc_name_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } uint32_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL); uint32_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL); uint32_t exit_value = pbc_rmessage_integer(completed_procs_msg, "exit_value", 0, NULL); /* next, we will modify proc's state */ orte_job_t* tmp_jdata = (orte_job_t*) opal_pointer_array_get_item(orte_job_data, local_jobid); orte_proc_t* proc = (orte_proc_t*) opal_pointer_array_get_item(tmp_jdata->procs, vpid); if (tmp_jdata->jobid == jdata->jobid) { num_completed_jdata_procs++; } if (exit_value == 0) { proc->state = ORTE_PROC_STATE_TERMINATED; } /* if this process is already terminated, just skip over */ if (proc->state >= ORTE_PROC_STATE_TERMINATED) { continue; } if (exit_value == -1000 || exit_value == -100 || exit_value == -101) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb proc failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR); proc->state = ORTE_PROC_STATE_FAILED_TO_START; ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } else { /* here, means currently the proc's state < ORTE_PROC_STATE_TERMINATED, * however, from AM's heartbeat response, we got the proc's container is terminated, * to solve this dilemma , we set a timer event to reconfirm this proc's state, */ opal_event_t *ev = NULL; ev = (opal_event_t*) malloc(sizeof(opal_event_t)); struct timeval delay; delay.tv_sec = 15; delay.tv_usec = 0; opal_event_evtimer_set(orte_event_base, ev, process_state_monitor_cb, proc); opal_event_evtimer_add(ev, &delay); } } cleanup: if (response_msg) { pbc_rmessage_delete(response_msg); } if (num_completed_jdata_procs == 
jdata->num_procs) { /* * all procs are completed, send finish request to AM, * modify job state to ORTE_JOB_STATE_TERMINATED */ jdata->state = ORTE_JOB_STATE_TERMINATED; finish_app_master(0 == orte_exit_status); return; } else { /* next heartbeat */ opal_event_t *ev = NULL; ev = (opal_event_t*) malloc(sizeof(opal_event_t)); struct timeval delay; delay.tv_sec = 1; delay.tv_usec = 0; opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata); opal_event_evtimer_add(ev, &delay); } }