static void signal_trap(int fd, short flags, void *arg)
{
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        if (forcibly_die) {
            opal_output(0, "%s forcibly exiting upon signal %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strsignal(fd));
            /* exit with a non-zero status */
            exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        opal_output(0, "orcm: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n");
        forcibly_die = true;
        return;
    }

    /* set the global abnormal exit flag so we know not to
     * use the standard xcast for terminating orteds
     */
    orte_abnormal_term_ordered = true;
    /* ensure that the forwarding of stdin stops */
    orte_job_term_ordered = true;

    /* if we are the scheduler and user directed, order any
     * associated orcm daemons to die */
    if (ORCM_PROC_IS_SCHEDULER && orcm_sched_kill_dvm &&
        NULL != orte_plm.terminate_orteds) {
        orte_plm.terminate_orteds();
    }

    ORTE_TIMER_EVENT(0, 0, orcm_just_quit);
}
/*****************
 * Local Functions
 *****************/
static void default_hnp_abort(orte_job_t *jdata)
{
    int rc;

    /* if we are already in progress, then ignore this call */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* the job aborted - turn off any sensors on this job */
    orte_sensor.stop(jdata->jobid);

    /* set control params to indicate we are terminating */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* if it is the daemon job that aborted, then we need
     * to flag an abnormal term - otherwise, just abort
     * the job cleanly
     */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
        orte_abnormal_term_ordered = true;
    }

    if (0 < jdata->num_non_zero_exit) {
        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    jdata->num_non_zero_exit,
                    (1 == jdata->num_non_zero_exit) ? "process returned\na non-zero exit code." :
                                                      "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* tell the plm to terminate the orteds - they will automatically
     * kill their local procs
     */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
        ORTE_ERROR_LOG(rc);
    }
}
/*****************
 * Local Functions
 *****************/
static void default_hnp_abort(orte_job_t *jdata)
{
    int rc;
    int32_t i32, *i32ptr;

    /* if we are already in progress, then ignore this call */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* set control params to indicate we are terminating */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* if it is the daemon job that aborted, then we need
     * to flag an abnormal term - otherwise, just abort
     * the job cleanly
     */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
        orte_abnormal_term_ordered = true;
    }

    i32 = 0;
    i32ptr = &i32;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32)) {
        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "Primary" : "Child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    i32,
                    (1 == i32) ? "process returned\na non-zero exit code" :
                                 "processes returned\nnon-zero exit codes");
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* tell the plm to terminate the orteds - they will automatically
     * kill their local procs
     */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
        ORTE_ERROR_LOG(rc);
    }
}
int orte_finalize(void)
{
    int rc;

    --orte_initialized;
    if (0 != orte_initialized) {
        /* check for mismatched calls */
        if (0 > orte_initialized) {
            opal_output(0, "%s MISMATCHED CALLS TO ORTE FINALIZE",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
        return ORTE_ERROR;
    }

    /* protect against multiple calls */
    if (opal_atomic_trylock(&orte_finalize_lock)) {
        return ORTE_SUCCESS;
    }

    /* flag that we are finalizing */
    orte_finalizing = true;

    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
        /* stop listening for connections - will
         * be ignored if no listeners were registered */
        orte_stop_listening();
    }

    /* flush the show_help system */
    orte_show_help_finalize();

    /* call the finalize function for this environment */
    if (ORTE_SUCCESS != (rc = orte_ess.finalize())) {
        return rc;
    }

    /* close the ess itself */
    (void) mca_base_framework_close(&orte_ess_base_framework);

    /* cleanup the process info */
    orte_proc_info_finalize();

    /* Close the general debug stream */
    opal_output_close(orte_debug_output);

    /* finalize the opal utilities */
    rc = opal_finalize();

    return rc;
}
static void clean_abort(int fd, short flags, void *arg)
{
    /* if we have already ordered this once, don't keep
     * doing it to avoid race conditions
     */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        if (forcibly_die) {
            /* kill any local procs */
            orte_odls.kill_local_procs(NULL);
            /* whack any lingering session directory files from our jobs */
            orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
            /* cleanup our data server */
            orte_data_server_finalize();
            /* exit with a non-zero status */
            exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
        }
        fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n",
                orte_basename);
        forcibly_die = true;
        /* reset the event */
        opal_event_add(&term_handler, NULL);
        return;
    }

    /* ensure we exit with a non-zero status */
    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);

    /* ensure that the forwarding of stdin stops */
    orte_job_term_ordered = true;

    /* tell us to be quiet - hey, the user killed us with a ctrl-c,
     * so need to tell them that!
     */
    orte_execute_quiet = true;

    if (!orte_never_launched) {
        /* cleanup our data server */
        orte_data_server_finalize();
    }

    /* We are in an event handler; the job completed procedure
       will delete the signal handler that is currently running
       (which is a Bad Thing), so we can't call it directly.
       Instead, we have to exit this handler and setup to call
       job_completed() after this. */
    orte_plm.terminate_orteds();
}
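The clean_abort() handler above gates the whole abort sequence on a one-shot lock: the first caller acquires orte_abort_inprogress_lock and orders termination, while later callers land in the "already in progress" branch and, on a further ctrl-c, force an exit. Below is a minimal, self-contained sketch of that same first-caller-wins idiom using standard C11 atomics rather than OPAL's lock type; on_ctrl_c and abort_inprogress are illustrative stand-ins, not ORTE symbols.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-ins for the ORTE globals used above */
static atomic_flag abort_inprogress = ATOMIC_FLAG_INIT;
static bool forcibly_die = false;

static void on_ctrl_c(void)
{
    /* atomic_flag_test_and_set() returns the previous value, i.e. true if
     * the abort was already in progress -- the same one-shot guard that
     * opal_atomic_trylock() provides in the snippets above */
    if (atomic_flag_test_and_set(&abort_inprogress)) {
        if (forcibly_die) {
            printf("second warning already given: forcibly terminating\n");
            return;
        }
        printf("abort is already in progress: hit ctrl-c again to force\n");
        forcibly_die = true;
        return;
    }
    printf("first ctrl-c: ordering an orderly shutdown\n");
}

int main(void)
{
    on_ctrl_c();   /* orderly shutdown path         */
    on_ctrl_c();   /* "already in progress" warning */
    on_ctrl_c();   /* forced termination path       */
    return 0;
}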
void orte_trigger_event(orte_trigger_event_t *trig)
{
    int data = 1;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s calling %s trigger",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         trig->name));

    if (opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
        return;
    }

    send(trig->channel, (const char*)&data, sizeof(int), 0);
    closesocket(trig->channel);
    opal_progress();
}
static int atomic_spinlock_test(opal_atomic_lock_t *lock, int count, int id)
{
    int i;

    for (i = 0; i < count; ++i) {
        opal_atomic_lock(lock);
        if (atomic_verbose) {
            printf("id %03d has the lock (lock)\n", id);
        }
        opal_atomic_unlock(lock);

        while (opal_atomic_trylock(lock)) {
            ;
        }
        if (atomic_verbose) {
            printf("id %03d has the lock (trylock)\n", id);
        }
        opal_atomic_unlock(lock);
    }

    return 0;
}
void orte_trigger_event(orte_trigger_event_t *trig)
{
    int data = 1;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s calling %s trigger",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         trig->name));

    /* if we already fired it, don't do it again - this automatically
     * records that we did fire it */
    if (opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
        return;
    }

    write(trig->channel, &data, sizeof(int));
    close(trig->channel);
    opal_progress();
}
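Both variants of orte_trigger_event() above pair the one-shot trylock guard with a wakeup: a sentinel integer is written to trig->channel, a descriptor the event/progress machinery is watching, and the descriptor is then closed so the trigger cannot fire twice. The following is a rough sketch of that wakeup idiom with a plain POSIX pipe, independent of the OPAL/ORTE types; fds, data, and recvd are illustrative names.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fds[2];
    int data = 1, recvd = 0;
    ssize_t n;

    /* the read end (fds[0]) is what an event loop would monitor;
     * the write end (fds[1]) plays the role of trig->channel */
    if (0 != pipe(fds)) {
        perror("pipe");
        return 1;
    }

    /* fire the trigger once: write the sentinel and close the channel */
    n = write(fds[1], &data, sizeof(data));
    if (n != (ssize_t)sizeof(data)) {
        perror("write");
    }
    close(fds[1]);

    /* the watcher side: a registered callback would normally do this read */
    n = read(fds[0], &recvd, sizeof(recvd));
    if (n == (ssize_t)sizeof(recvd)) {
        printf("trigger fired, sentinel = %d\n", recvd);
    }
    close(fds[0]);
    return 0;
}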
/*
 * This function gets called by the PLM when an orted notifies us that
 * a job failed to start.
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 */
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
{
    int rc;

    OPAL_TRACE(1);

    /* if we are already in progress, then ignore this call */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: job %s reported incomplete start with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));

    orte_job_term_ordered = true;

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whoever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* wakeup orterun so we can exit */
    orte_trigger_event(&orte_exit);
}
void orte_quit(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* cleanup */
    if (NULL != caddy) {
        OBJ_RELEASE(caddy);
    }

    /* check one-time lock to protect against "bounce" */
    if (opal_atomic_trylock(&orte_quit_lock)) { /* returns 1 if already locked */
        return;
    }

    /* if we are the hnp and haven't already reported it, then
     * report any errors
     */
    if (ORTE_PROC_IS_HNP && !errors_reported) {
        if (0 != orte_exit_status && !orte_execute_quiet) {
            errors_reported = true;
            /* abnormal termination of some kind */
            dump_aborted_procs();
            /* If we showed more abort messages than were allowed,
               show a followup message here */
            if (num_failed_start > 1) {
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "<stderr>");
                }
                fprintf(orte_xml_fp, "%d total process%s failed to start",
                        num_failed_start, ((num_failed_start > 1) ? "es" : ""));
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "\n</stderr>");
                }
                fprintf(orte_xml_fp, "\n");
            }
            if (num_aborted > 1) {
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "<stderr>");
                }
                fprintf(orte_xml_fp, "%d total process%s aborted",
                        num_aborted, ((num_aborted > 1) ? "es" : ""));
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "\n</stderr>");
                }
                fprintf(orte_xml_fp, "\n");
            }
            if (num_killed > 1) {
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "<stderr>");
                }
                fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
                        num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
                if (orte_xml_output) {
                    fprintf(orte_xml_fp, "\n</stderr>");
                }
                fprintf(orte_xml_fp, "\n");
            }
        }
    }

    /* flag that the event lib should no longer be looped
     * so we will exit
     */
    orte_event_base_active = false;
}
/*
 * This function gets called by the PLM when an orted notifies us
 * that a process has aborted.
 * Various components will follow their own strategy for dealing with
 * this situation. For this component, we simply kill the job.
 */
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
{
    int rc;
    orte_job_t **jobs;
    orte_std_cntr_t i;

    OPAL_TRACE(1);

    /* if we are already in progress, then ignore this call */
    if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: proc %s aborting with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name), exit_code));

    orte_job_term_ordered = true;

    /* indicate that all jobs other than the one containing this
     * proc have been ordered to abort - this is necessary to avoid
     * duplicate ordering of "abort".
     *
     * NOTE: be sure to not include the 0 job data location as this
     * contains the daemons!
     */
    jobs = (orte_job_t**)orte_job_data->addr;
    for (i = 1; i < orte_job_data->size; i++) {
        /* the array is left-justified, so we can quit once
         * we see a NULL */
        if (NULL == jobs[i]) {
            break;
        }
        if (ORTE_JOB_STATE_ABORTED != jobs[i]->state &&
            ORTE_JOB_STATE_ABORTED_BY_SIG != jobs[i]->state &&
            ORTE_JOB_STATE_ABORTED_WO_SYNC != jobs[i]->state) {
            jobs[i]->state = ORTE_JOB_STATE_ABORT_ORDERED;
        }
    }

    /* tell the plm to terminate all jobs */
    if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
        ORTE_ERROR_LOG(rc);
    }

    /* set the exit status, just in case whoever called us failed
     * to do so - it can only be done once, so we are protected
     * from overwriting it
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* wakeup orterun so we can exit */
    orte_trigger_event(&orte_exit);
}
/*
 * Progress the event library and any functions that have registered to
 * be called.  We don't propagate errors from the progress functions,
 * so no action is taken if they return failures.  The functions are
 * expected to return the number of events progressed, to determine
 * whether or not we should call sched_yield() during MPI progress.
 * This is only loosely tracked, as an error return can cause the number
 * of progressed events to appear lower than it actually is.  We don't
 * care, as the cost of that happening is far outweighed by the cost
 * of the if checks (they were resulting in bad pipe stalling behavior)
 */
void opal_progress(void)
{
    size_t i;
    int events = 0;

    if (opal_progress_event_flag != 0) {
#if (OMPI_ENABLE_PROGRESS_THREADS == 0) && OPAL_HAVE_WORKING_EVENTOPS
#if OPAL_PROGRESS_USE_TIMERS
#if OPAL_TIMER_USEC_NATIVE
        opal_timer_t now = opal_timer_base_get_usec();
#else
        opal_timer_t now = opal_timer_base_get_cycles();
#endif  /* OPAL_TIMER_USEC_NATIVE */
        /* trip the event library if we've reached our tick rate and we are enabled */
        if (now - event_progress_last_time > event_progress_delta) {
#if OMPI_HAVE_THREAD_SUPPORT
            if (opal_atomic_trylock(&progress_lock)) {
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
                event_progress_last_time = (event_num_mpi_users > 0) ?
                    now - event_progress_delta : now;

                events += opal_event_loop(opal_progress_event_flag);
#if OMPI_HAVE_THREAD_SUPPORT
                opal_atomic_unlock(&progress_lock);
            }
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
        }

#else /* OPAL_PROGRESS_USE_TIMERS */
        /* trip the event library if we've reached our tick rate and we are enabled */
        if (OPAL_THREAD_ADD32(&event_progress_counter, -1) <= 0) {
#if OMPI_HAVE_THREAD_SUPPORT
            if (opal_atomic_trylock(&progress_lock)) {
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
                event_progress_counter = (event_num_mpi_users > 0) ?
                    0 : event_progress_delta;

                events += opal_event_loop(opal_progress_event_flag);
#if OMPI_HAVE_THREAD_SUPPORT
                opal_atomic_unlock(&progress_lock);
            }
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */
#endif /* OMPI_ENABLE_PROGRESS_THREADS == 0 && OPAL_HAVE_WORKING_EVENTOPS */
    }

    /* progress all registered callbacks */
    for (i = 0; i < callbacks_len; ++i) {
        events += (callbacks[i])();
    }

#if defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD)
    if (call_yield && events <= 0) {
        /* If there is nothing to do - yield the processor - otherwise
         * we could consume the processor for the entire time slice. If
         * the processor is oversubscribed - this will result in a best-case
         * latency equivalent to the time-slice.
         */
#if defined(__WINDOWS__)
        SwitchToThread();
#else
        sched_yield();
#endif  /* defined(__WINDOWS__) */
    }
#endif  /* defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD) */
}
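The timer branch of opal_progress() above only trips the event library once event_progress_delta has elapsed since the last poll, so callers that spin on progress do not hammer the event core. Below is a standalone sketch of that rate-limiting idea, assuming a monotonic microsecond clock; tick_delta_usec, last_poll_usec, and progress() are illustrative stand-ins rather than OPAL symbols.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

/* illustrative stand-ins for event_progress_delta / event_progress_last_time */
static long long tick_delta_usec = 10000;   /* poll at most every 10 ms */
static long long last_poll_usec  = 0;

static long long now_usec(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (long long)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

/* Mirrors the OPAL_PROGRESS_USE_TIMERS branch above: the (relatively
 * expensive) event-library poll only runs once the tick interval has
 * elapsed, and the timestamp is reset for the next interval. */
static int progress(void)
{
    int events = 0;
    long long now = now_usec();

    if (now - last_poll_usec > tick_delta_usec) {
        last_poll_usec = now;
        /* ... the real code calls opal_event_loop() here ... */
        events += 1;   /* pretend one event was handled */
    }
    return events;
}

int main(void)
{
    for (int i = 0; i < 5; ++i) {
        printf("progress() handled %d event(s)\n", progress());
    }
    return 0;
}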