Beispiel #1
0
/*
 * Event callback run when a termination signal has been trapped.
 *
 * fd    - the signal number (it is handed to strsignal() below)
 * flags - not used
 * arg   - not used
 *
 * A zero return from the trylock on orte_abort_inprogress_lock is treated
 * as "an abort is already under way": the first repeat warns the user and
 * arms forcibly_die, the next one exits the process right away. Otherwise
 * the termination flags are raised, the associated orcm daemons are
 * optionally ordered to die, and orcm_just_quit is handed to
 * ORTE_TIMER_EVENT.
 *
 * NOTE(review): the original inline remark said the trylock "returns 1 if
 * already locked", which is the opposite of what this test assumes -
 * confirm which convention this revision of opal_atomic_trylock follows.
 */
static void signal_trap(int fd, short flags, void *arg)
{
    const int lock_rc = opal_atomic_trylock(&orte_abort_inprogress_lock);

    if (0 == lock_rc) {
        if (!forcibly_die) {
            /* first repeat: warn, and arm the forced exit for the next one */
            opal_output(0, "orcm: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n");
            forcibly_die = true;
            return;
        }

        /* the user insisted - leave now with a non-zero status */
        opal_output(0, "%s forcibly exiting upon signal %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), strsignal(fd));
        exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* raise the global abnormal-exit flag so the standard xcast is not
     * used for terminating orteds
     */
    orte_abnormal_term_ordered = true;

    /* make sure stdin forwarding stops */
    orte_job_term_ordered = true;

    /* the scheduler, when the user asked for it, orders any associated
     * orcm daemons to die - provided the plm offers that entry point
     */
    if (ORCM_PROC_IS_SCHEDULER && orcm_sched_kill_dvm &&
        orte_plm.terminate_orteds != NULL) {
        orte_plm.terminate_orteds();
    }

    ORTE_TIMER_EVENT(0, 0, orcm_just_quit);
}
Beispiel #2
0
/*****************
 * Local Functions
 *****************/
/*
 * Abort the given job on the HNP.
 *
 * jdata - the job being aborted; its jobid and num_non_zero_exit fields
 *         are read here.
 *
 * Only the first caller gets past the one-time orte_abort_inprogress_lock;
 * a non-zero trylock result means an abort is already running and the
 * call is ignored. The first caller stops the sensors on the job, raises
 * the termination flags, optionally warns about non-zero exit codes, and
 * asks the plm to terminate the orteds.
 */
static void default_hnp_abort(orte_job_t *jdata)
{
    int ret;

    /* somebody else is already aborting - nothing to do */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* the job aborted, so its sensors are no longer needed */
    orte_sensor.stop(jdata->jobid);

    /* record that we are terminating and that recovery is off */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* an abort of the daemon job itself is an abnormal termination;
     * any other job is simply aborted cleanly
     */
    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        orte_abnormal_term_ordered = true;
    }

    if (jdata->num_non_zero_exit > 0) {
        /* let the user know why the job was aborted */
        const int is_primary = (1 == ORTE_LOCAL_JOBID(jdata->jobid));
        const char *summary = (1 == jdata->num_non_zero_exit)
            ? "process returned\na non-zero exit code."
            : "processes returned\nnon-zero exit codes.";

        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n"
                    "-------------------------------------------------------",
                    is_primary ? "Primary" : "Child",
                    is_primary ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    jdata->num_non_zero_exit,
                    summary);
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have the plm terminate the orteds - they kill their own local
     * procs automatically
     */
    ret = orte_plm.terminate_orteds();
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }
}
Beispiel #3
0
/*****************
 * Local Functions
 *****************/
/*
 * Abort the given job on the HNP.
 *
 * jdata - the job being aborted; its jobid and attribute list are read
 *         here.
 *
 * Only the first caller gets past the one-time orte_abort_inprogress_lock;
 * a non-zero trylock result means an abort is already running and the
 * call is ignored. The first caller raises the termination flags, warns
 * the user when the ORTE_JOB_NUM_NONZERO_EXIT attribute is present on the
 * job, and asks the plm to terminate the orteds.
 */
static void default_hnp_abort(orte_job_t *jdata)
{
    int ret;
    int32_t nonzero_count = 0;
    int32_t *nonzero_ptr = &nonzero_count;

    /* somebody else is already aborting - nothing to do */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_hnp: abort in progress, ignoring abort on job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: abort called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* record that we are terminating and that recovery is off */
    orte_job_term_ordered = true;
    orte_enable_recovery = false;

    /* an abort of the daemon job itself is an abnormal termination;
     * any other job is simply aborted cleanly
     */
    if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        orte_abnormal_term_ordered = true;
    }

    /* the lookup writes through nonzero_ptr into nonzero_count */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT,
                           (void**)&nonzero_ptr, OPAL_INT32)) {
        /* let the user know why the job was aborted */
        const int is_primary = (1 == ORTE_LOCAL_JOBID(jdata->jobid));
        const char *summary = (1 == nonzero_count)
            ? "process returned\na non-zero exit code"
            : "processes returned\nnon-zero exit codes";

        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "%s job %s terminated normally, but %d %s. Per user-direction, the job has been aborted.\n"
                    "-------------------------------------------------------",
                    is_primary ? "Primary" : "Child",
                    is_primary ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    nonzero_count, summary);
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: ordering orted termination",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have the plm terminate the orteds - they kill their own local
     * procs automatically
     */
    ret = orte_plm.terminate_orteds();
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }
}
Beispiel #4
0
/*
 * Tear down ORTE.
 *
 * Each call decrements orte_initialized; the teardown itself only runs
 * on the call that brings the counter to exactly zero, and then only
 * once (guarded by orte_finalize_lock).
 *
 * Returns ORTE_ERROR when the counter is non-zero after the decrement
 * (a negative value is additionally reported as mismatched calls),
 * ORTE_SUCCESS when the finalize lock was already taken, the error code
 * from orte_ess.finalize() if that fails, and otherwise whatever
 * opal_finalize() returns.
 */
int orte_finalize(void)
{
    int status;

    --orte_initialized;

    if (0 > orte_initialized) {
        /* more finalize calls than init calls */
        opal_output(0, "%s MISMATCHED CALLS TO ORTE FINALIZE",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    if (0 < orte_initialized) {
        /* the counter has not reached zero yet */
        return ORTE_ERROR;
    }

    /* guard against running the teardown more than once */
    if (opal_atomic_trylock(&orte_finalize_lock)) {
        return ORTE_SUCCESS;
    }

    /* from here on we are finalizing */
    orte_finalizing = true;

    /* HNPs and daemons stop listening for connections - this is
     * ignored when no listeners were registered
     */
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
        orte_stop_listening();
    }

    /* flush the show_help system */
    orte_show_help_finalize();

    /* run the environment-specific finalize; bail out on failure */
    status = orte_ess.finalize();
    if (ORTE_SUCCESS != status) {
        return status;
    }

    /* close the ess framework itself */
    (void) mca_base_framework_close(&orte_ess_base_framework);

    /* release the process info */
    orte_proc_info_finalize();

    /* close the general debug stream */
    opal_output_close(orte_debug_output);

    /* finally, shut down the opal utilities and report their status */
    status = opal_finalize();
    return status;
}
Beispiel #5
0
/*
 * Event callback that starts a clean abort (the warning text below
 * refers to ctrl-c).
 *
 * fd, flags, arg - not used
 *
 * The first call takes orte_abort_inprogress_lock and orders the daemons
 * to terminate. A later call (non-zero trylock result) warns the user
 * and arms forcibly_die; the call after that kills local procs, cleans
 * up, and exits immediately with a non-zero status.
 */
static void clean_abort(int fd, short flags, void *arg)
{
    /* an abort has been ordered once already - do not order it again,
     * to avoid race conditions
     */
    if (opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        if (!forcibly_die) {
            fprintf(stderr, "%s: abort is already in progress...hit ctrl-c again to forcibly terminate\n\n", orte_basename);
            forcibly_die = true;
            /* re-arm the event so the next signal reaches us */
            opal_event_add(&term_handler, NULL);
            return;
        }

        /* forced exit: kill any local procs */
        orte_odls.kill_local_procs(NULL);

        /* remove any lingering session directory files from our jobs */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* shut down our data server */
        orte_data_server_finalize();

        /* leave with a non-zero status */
        exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* make sure we eventually exit with a non-zero status */
    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);

    /* make sure stdin forwarding stops */
    orte_job_term_ordered = true;

    /* stay quiet - the user killed us with a ctrl-c, so there is no
     * need to tell them about it
     */
    orte_execute_quiet = true;

    /* shut down the data server unless we never launched */
    if (!orte_never_launched) {
        orte_data_server_finalize();
    }

    /* We are running inside an event handler, so the job-completed
     * procedure is not invoked from here (the original note warns that
     * it would delete the signal handler that is currently executing).
     * All we do is order the daemons to terminate.
     */
    orte_plm.terminate_orteds();
}
Beispiel #6
0
/*
 * Fire a one-shot trigger (Winsock variant).
 *
 * trig - the trigger to fire; its name, lock and channel fields are used.
 *
 * The trigger's lock doubles as the "already fired" record: a non-zero
 * trylock result means a previous call fired it, and this call returns
 * without doing anything. Otherwise a single int is sent on the trigger
 * channel, the channel is closed, and opal_progress() is called.
 *
 * Because the lock has been consumed by the time the data is sent, a
 * failed or short send cannot be retried by calling this function
 * again; the failure is therefore reported instead of being silently
 * dropped.
 */
void orte_trigger_event(orte_trigger_event_t *trig)
{
    int data=1;
    int sent;
    
    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s calling %s trigger",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         trig->name));
    
    /* already fired - do not fire it again */
    if (opal_atomic_trylock(&trig->lock)) {
        return;
    }
        
    /* send() reports an error or the number of bytes actually sent;
     * anything other than a full int means the trigger did not go out
     */
    sent = send(trig->channel, (const char*)&data, sizeof(int), 0);
    if ((int)sizeof(int) != sent) {
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s failed to send on %s trigger channel",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             trig->name));
    }
    closesocket(trig->channel);
    opal_progress();
}
Beispiel #7
0
/*
 * Exercise a spinlock from one test participant.
 *
 * lock  - the lock under test
 * count - number of rounds to run
 * id    - identifier printed in the verbose messages
 *
 * Each round acquires the lock once with the blocking call and once by
 * spinning on opal_atomic_trylock() until it returns zero, releasing
 * the lock after each acquisition. Always returns 0.
 */
static int
atomic_spinlock_test(opal_atomic_lock_t *lock, int count, int id)
{
    int round = 0;

    while (round < count) {
        /* blocking acquisition */
        opal_atomic_lock(lock);
        if (atomic_verbose) {
            printf("id %03d has the lock (lock)\n", id);
        }
        opal_atomic_unlock(lock);

        /* spin until the trylock reports zero */
        for (;;) {
            if (!opal_atomic_trylock(lock)) {
                break;
            }
        }
        if (atomic_verbose) {
            printf("id %03d has the lock (trylock)\n", id);
        }
        opal_atomic_unlock(lock);

        ++round;
    }

    return 0;
}
Beispiel #8
0
/*
 * Fire a one-shot trigger.
 *
 * trig - the trigger to fire; its name, lock and channel fields are used.
 *
 * The trigger's lock doubles as the "already fired" record: a non-zero
 * trylock result means a previous call fired it, and this call returns
 * without doing anything. Otherwise a single int is written to the
 * trigger channel, the channel is closed, and opal_progress() is called.
 *
 * Because the lock has been consumed by the time the data is written, a
 * failed or short write cannot be retried by calling this function
 * again; the failure is therefore reported instead of being silently
 * dropped.
 */
void orte_trigger_event(orte_trigger_event_t *trig)
{
    int data=1;
    ssize_t nwritten;
    
    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                        "%s calling %s trigger",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         trig->name));
    
    /* if we already fired it, don't do it again - taking the lock
     * automatically records that we did fire it
     */
    if (opal_atomic_trylock(&trig->lock)) {
        return;
    }
    
    /* write() reports -1 on error or the number of bytes actually
     * written; anything other than a full int means the trigger did
     * not go out
     */
    nwritten = write(trig->channel, &data, sizeof(int));
    if ((ssize_t)sizeof(int) != nwritten) {
        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s failed to write to %s trigger channel",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             trig->name));
    }
    close(trig->channel);
    opal_progress();
}
/*
 * Called by the PLM when an orted notifies us that a job failed to
 * start. Components are free to handle this however they like; this
 * one simply kills the job.
 *
 * job       - id of the job that reported the incomplete start
 * exit_code - status to record as the exit status
 *
 * NOTE(review): a zero return from the trylock is treated here as
 * "abort already in progress", while the original inline remark said
 * the trylock "returns 1 if already locked" - confirm which convention
 * this revision of opal_atomic_trylock follows.
 */
void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code)
{
    int ret;

    OPAL_TRACE(1);

    /* an abort is already under way - ignore this call */
    if (0 == opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job), exit_code));
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: job %s reported incomplete start with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job), exit_code));

    orte_job_term_ordered = true;

    /* have the plm terminate every job */
    ret = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }

    /* record the exit status in case our caller did not - it can only
     * be set once, so an earlier value is not overwritten
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* wake up orterun so we can exit */
    orte_trigger_event(&orte_exit);
}
Beispiel #10
0
/*
 * Event callback that ends the event loop.
 *
 * fd, args - not used
 * cbdata   - optional orte_state_caddy_t; released here when non-NULL
 *
 * Protected by the one-time orte_quit_lock, so only the first call does
 * any work. On the HNP, when the exit status is non-zero and quiet mode
 * is off, the aborted procs are dumped and summary lines are written to
 * orte_xml_fp for each counter (failed starts, aborted, killed) that is
 * greater than one. Finally orte_event_base_active is cleared.
 */
void orte_quit(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state_caddy = (orte_state_caddy_t*)cbdata;

    /* release the caddy, if we were given one */
    if (NULL != state_caddy) {
        OBJ_RELEASE(state_caddy);
    }

    /* one-time lock: protects against "bounce" */
    if (opal_atomic_trylock(&orte_quit_lock)) {
        return;
    }

    /* the hnp reports errors once, and only for an abnormal
     * termination that the user has not asked us to keep quiet about
     */
    if (ORTE_PROC_IS_HNP && !errors_reported &&
        0 != orte_exit_status && !orte_execute_quiet) {
        errors_reported = true;
        dump_aborted_procs();

        /* follow-up summaries for anything that happened more than once */
        if (1 < num_failed_start) {
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "<stderr>");
            }
            fprintf(orte_xml_fp, "%d total process%s failed to start",
                    num_failed_start, ((num_failed_start > 1) ? "es" : ""));
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "&#010;</stderr>");
            }
            fprintf(orte_xml_fp, "\n");
        }

        if (1 < num_aborted) {
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "<stderr>");
            }
            fprintf(orte_xml_fp, "%d total process%s aborted",
                    num_aborted, ((num_aborted > 1) ? "es" : ""));
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "&#010;</stderr>");
            }
            fprintf(orte_xml_fp, "\n");
        }

        if (1 < num_killed) {
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "<stderr>");
            }
            fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
                    num_killed, ((num_killed > 1) ? "es" : ""), orte_basename);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "&#010;</stderr>");
            }
            fprintf(orte_xml_fp, "\n");
        }
    }

    /* tell the event lib to stop looping so that we exit */
    orte_event_base_active = false;
}
/*
 * Called by the PLM when an orted notifies us that a process has
 * aborted. Components are free to handle this however they like; this
 * one simply kills the job.
 *
 * name      - name of the process that aborted
 * exit_code - status to record as the exit status
 *
 * NOTE(review): a zero return from the trylock is treated here as
 * "abort already in progress", while the original inline remark said
 * the trylock "returns 1 if already locked" - confirm which convention
 * this revision of opal_atomic_trylock follows.
 */
void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code)
{
    int ret;
    orte_job_t **job_table;
    orte_std_cntr_t idx;

    OPAL_TRACE(1);

    /* an abort is already under way - ignore this call */
    if (0 == opal_atomic_trylock(&orte_abort_inprogress_lock)) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                             "%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(name), exit_code));

        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output,
                         "%s errmgr:default: proc %s aborting with status %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(name), exit_code));

    orte_job_term_ordered = true;

    /* Mark every job that is not already in one of the aborted states
     * as having been ordered to abort - this avoids ordering "abort"
     * more than once.
     *
     * NOTE: slot 0 of the job data array holds the daemons and is
     * deliberately skipped. The array is left-justified, so the walk
     * stops at the first NULL entry.
     */
    job_table = (orte_job_t**)orte_job_data->addr;
    for (idx = 1; idx < orte_job_data->size && NULL != job_table[idx]; idx++) {
        orte_job_t *job = job_table[idx];

        if (!(ORTE_JOB_STATE_ABORTED == job->state ||
              ORTE_JOB_STATE_ABORTED_BY_SIG == job->state ||
              ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state)) {
            job->state = ORTE_JOB_STATE_ABORT_ORDERED;
        }
    }

    /* have the plm terminate every job */
    ret = orte_plm.terminate_job(ORTE_JOBID_WILDCARD);
    if (ORTE_SUCCESS != ret) {
        ORTE_ERROR_LOG(ret);
    }

    /* record the exit status in case our caller did not - it can only
     * be set once, so an earlier value is not overwritten
     */
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* wake up orterun so we can exit */
    orte_trigger_event(&orte_exit);
}
Beispiel #12
0
/*
 * Progress the event library and any functions that have registered to
 * be called.  We don't propagate errors from the progress functions,
 * so no action is taken if they return failures.  The functions are
 * expected to return the number of events progressed, to determine
 * whether or not we should call sched_yield() during MPI progress.
 * This is only loosely tracked, as an error return can cause the number
 * of progressed events to appear lower than it actually is.  We don't
 * care, as the cost of that happening is far outweighed by the cost
 * of the if checks (they were resulting in bad pipe stalling behavior)
 *
 * The event library is only tripped from here when
 * opal_progress_event_flag is non-zero and the build has progress
 * threads disabled with working event ops.  How often it is tripped
 * depends on OPAL_PROGRESS_USE_TIMERS: either when more than
 * event_progress_delta has elapsed since event_progress_last_time, or
 * when the decremented event_progress_counter reaches zero or below.
 *
 * NOTE(review): in this function the guarded section (which ends in
 * opal_atomic_unlock) runs when opal_atomic_trylock() returns non-zero,
 * i.e. a non-zero return is treated as "lock acquired" - confirm this
 * matches the trylock convention of the revision this code is built
 * against.
 */
void
opal_progress(void)
{
    size_t i;
    /* running total of what opal_event_loop() and the callbacks return */
    int events = 0;

    if( opal_progress_event_flag != 0 ) {
#if (OMPI_ENABLE_PROGRESS_THREADS == 0) && OPAL_HAVE_WORKING_EVENTOPS
#if OPAL_PROGRESS_USE_TIMERS
    /* "now" is in microseconds or in cycles, depending on what the
       timer natively provides */
#if OPAL_TIMER_USEC_NATIVE
    opal_timer_t now = opal_timer_base_get_usec();
#else
    opal_timer_t now = opal_timer_base_get_cycles();
#endif  /* OPAL_TIMER_USEC_NATIVE */
    /* trip the event library if we've reached our tick rate and we are
       enabled */
        if (now - event_progress_last_time > event_progress_delta ) {
#if OMPI_HAVE_THREAD_SUPPORT
            /* with thread support, the section below only runs when the
               trylock returns non-zero, and it releases the lock itself */
            if (opal_atomic_trylock(&progress_lock)) {
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
                /* with MPI users present, back-date the timestamp by one
                   delta; otherwise restart the interval from now */
                event_progress_last_time = (event_num_mpi_users > 0) ? 
                    now - event_progress_delta : now;

                events += opal_event_loop(opal_progress_event_flag);
#if OMPI_HAVE_THREAD_SUPPORT
                opal_atomic_unlock(&progress_lock);
            }
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
        }

#else /* OPAL_PROGRESS_USE_TIMERS */
    /* trip the event library if we've reached our tick rate and we are
       enabled */
        if (OPAL_THREAD_ADD32(&event_progress_counter, -1) <= 0 ) {
#if OMPI_HAVE_THREAD_SUPPORT
            /* with thread support, the section below only runs when the
               trylock returns non-zero, and it releases the lock itself */
            if (opal_atomic_trylock(&progress_lock)) {
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
                /* with MPI users present, reset the counter to 0;
                   otherwise reload it with the full delta */
                event_progress_counter = 
                    (event_num_mpi_users > 0) ? 0 : event_progress_delta;
                events += opal_event_loop(opal_progress_event_flag);
#if OMPI_HAVE_THREAD_SUPPORT
                opal_atomic_unlock(&progress_lock);
            }
#endif  /* OMPI_HAVE_THREAD_SUPPORT */
        }
#endif /* OPAL_PROGRESS_USE_TIMERS */

#endif /* OMPI_ENABLE_PROGRESS_THREADS == 0 && OPAL_HAVE_WORKING_EVENTOPS */
    }

    /* progress all registered callbacks */
    for (i = 0 ; i < callbacks_len ; ++i) {
        events += (callbacks[i])();
    }

#if defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD)
    if (call_yield && events <= 0) {
        /* If there is nothing to do - yield the processor - otherwise
         * we could consume the processor for the entire time slice. If
         * the processor is oversubscribed - this will result in a best-case
         * latency equivalent to the time-slice.
         */
#if defined(__WINDOWS__)
        SwitchToThread();
#else
        sched_yield();
#endif  /* defined(__WINDOWS__) */
    }
#endif  /* defined(__WINDOWS__) || defined(HAVE_SCHED_YIELD) */
}