Example #1
0
int orte_ess_base_app_finalize(void)
{
    orte_cr_finalize();

#if OPAL_ENABLE_FT_CR == 1
    (void) mca_base_framework_close(&orte_snapc_base_framework);
    (void) mca_base_framework_close(&orte_sstore_base_framework);
#endif

    /* close frameworks */
    (void) mca_base_framework_close(&orte_filem_base_framework);
    (void) mca_base_framework_close(&orte_errmgr_base_framework);

    /* now can close the rml and its friendly group comm */
    (void) mca_base_framework_close(&orte_grpcomm_base_framework);
    (void) mca_base_framework_close(&orte_dfs_base_framework);
    (void) mca_base_framework_close(&orte_routed_base_framework);

    (void) mca_base_framework_close(&orte_rml_base_framework);
    (void) mca_base_framework_close(&orte_oob_base_framework);
    (void) mca_base_framework_close(&orte_state_base_framework);

    orte_session_dir_finalize(ORTE_PROC_MY_NAME);

    /* release the event base */
    if (progress_thread_running) {
        opal_progress_thread_finalize(NULL);
        progress_thread_running = false;
    }

    return ORTE_SUCCESS;
}
int orte_ess_base_app_finalize(void)
{
    orte_notifier_base_close();
    
    orte_cr_finalize();
    
#if OPAL_ENABLE_FT_CR == 1
    orte_snapc_base_close();
#endif
    orte_filem_base_close();
    
    orte_wait_finalize();
    
    /* close the multicast */
#if ORTE_ENABLE_MULTICAST
    orte_rmcast_base_close();
#endif
    
    /* now can close the rml and its friendly group comm */
    orte_grpcomm_base_close();
    orte_routed_base_close();
    orte_rml_base_close();
    
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
        
    return ORTE_SUCCESS;    
}
Example #3
0
static void rte_abort(int status, bool report)
{
    /* do NOT do a normal finalize as this will very likely
     * hang the process. We are aborting due to an abnormal condition
     * that precludes normal cleanup 
     *
     * We do need to do the following bits to make sure we leave a 
     * clean environment. Taken from orte_finalize():
     * - Assume errmgr cleans up child processes before we exit.
     */
    
    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();
    
    /* ensure we scrub the session directory tree */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
    
    /* - Clean out the global structures 
     * (not really necessary, but good practice)
     */
    orte_proc_info_finalize();
    
    /* just exit */
    exit(status);
}
void orte_ess_base_app_abort(int status, bool report)
{
    orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED;
    opal_buffer_t *buf;
    
    /* Exit - do NOT do a normal finalize as this will very likely
     * hang the process. We are aborting due to an abnormal condition
     * that precludes normal cleanup 
     *
     * We do need to do the following bits to make sure we leave a 
     * clean environment. Taken from orte_finalize():
     * - Assume errmgr cleans up child processes before we exit.
     */
    
    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();
    
    /* If we were asked to report this termination, do so */
    if (report) {
        buf = OBJ_NEW(opal_buffer_t);
        opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
        orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, 0, orte_rml_send_callback, NULL);
        OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                             "%s orte_ess_app_abort: sent abort msg to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
        /* get the ack - need this to ensure that the sync communication
         * gets serviced by the event library on the orted prior to the
         * process exiting
         */
        sync_waiting = true;
        if (ORTE_SUCCESS != orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
                                                    ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
            exit(status);
        }
        ORTE_WAIT_FOR_COMPLETION(sync_waiting);
    }
    
    /* - Clean out the global structures 
     * (not really necessary, but good practice) */
    orte_proc_info_finalize();
    
    /* Now Exit */
    exit(status);
}
/*
 * We do NOT call the regular C-library "abort" function, even
 * though that would have alerted us to the fact that this is
 * an abnormal termination, because it would automatically cause
 * a core file to be generated. On large systems, that can be
 * overwhelming (imagine a few thousand Gbyte-sized files hitting
                 * a shared file system simultaneously...ouch!).
 *
 * However, this causes a problem for OpenRTE as the system truly
 * needs to know that this actually IS an abnormal termination.
 * To get around the problem, we create a file in the session
 * directory - we don't need to put anything in it, though, as its
 * very existence simply alerts us that this was an abnormal
 * termination.
 *
 * The session directory finalize system will clean this file up
 * for us automagically. However, it needs to stick around long
 * enough for our local daemon to find it! So, we do NOT call
 * session_dir_finalize here!!! Someone will clean up for us.
 *
 * In some cases, however, we DON'T want to create that alert. For
 * example, if an orted detects that the HNP has died, then there
 * is truly nobody to alert! In these cases, we pass report=false
 * to prevent the abort file from being created. This allows the
 * session directory tree to cleanly be eliminated.
 */
void orte_ess_base_app_abort(int status, bool report)
{
    char *abort_file;
    int fd;
    
    /* Exit - do NOT do a normal finalize as this will very likely
     * hang the process. We are aborting due to an abnormal condition
     * that precludes normal cleanup 
     *
     * We do need to do the following bits to make sure we leave a 
     * clean environment. Taken from orte_finalize():
     * - Assume errmgr cleans up child processes before we exit.
     */
    
    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();
    
    /* If we were asked to report this termination,
     * write an "abort" file into our session directory
     */
    if (report) {
        abort_file = opal_os_path(false, orte_process_info.proc_session_dir, "abort", NULL);
        if (NULL == abort_file) {
            /* got a problem */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            goto CLEANUP;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                             "%s orte_ess_app_abort: dropping abort file %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file));
        fd = open(abort_file, O_CREAT, 0600);
        if (0 < fd) close(fd);        
    }
    
CLEANUP:
    /* - Clean out the global structures 
     * (not really necessary, but good practice) */
    orte_proc_info_finalize();
    
    /* Now Exit */
    exit(status);
}
Example #6
0
/*
 * We do NOT call the regular C-library "abort" function, even
 * though that would have alerted us to the fact that this is
 * an abnormal termination, because it would automatically cause
 * a core file to be generated. On large systems, that can be
 * overwhelming (imagine a few thousand Gbyte-sized files hitting
                 * a shared file system simultaneously...ouch!).
 *
 * However, this causes a problem for OpenRTE as the system truly
 * needs to know that this actually IS an abnormal termination.
 * To get around the problem, we drop a marker in the proc-level
 * session dir. If session dir's were not allowed, then we just
 * ignore this question.
 *
 * In some cases, however, we DON'T want to create that alert. For
 * example, if an orted detects that the HNP has died, then there
 * is truly nobody to alert! In these cases, we pass report=false
 * to indicate that we don't want the marker dropped.
 */
void orte_ess_base_app_abort(int status, bool report)
{
    int fd;
    char *myfile;
    struct timespec tp = {0, 100000};

    /* Exit - do NOT do a normal finalize as this will very likely
     * hang the process. We are aborting due to an abnormal condition
     * that precludes normal cleanup 
     *
     * We do need to do the following bits to make sure we leave a 
     * clean environment. Taken from orte_finalize():
     * - Assume errmgr cleans up child processes before we exit.
     */
    
    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();
    
    /* If we were asked to report this termination, do so.
     * Since singletons don't start an HNP unless necessary, and
     * direct-launched procs don't have daemons at all, only send
     * the message if routing is enabled as this indicates we
     * have someone to send to
     */
    if (report && orte_routing_is_enabled && orte_create_session_dirs) {
        myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
        fd = open(myfile, O_CREAT, S_IRUSR);
        close(fd);
        /* now introduce a short delay to allow any pending
         * messages (e.g., from a call to "show_help") to
         * have a chance to be sent */
        nanosleep(&tp, NULL);
    }
    
    /* - Clean out the global structures 
     * (not really necessary, but good practice) */
    orte_proc_info_finalize();
    
    /* Now Exit */
    _exit(status);
}
int orte_ess_base_app_finalize(void)
{    
    orte_cr_finalize();
    
#if OPAL_ENABLE_FT_CR == 1
    orte_snapc_base_close();
#endif
    orte_filem_base_close();
    
    orte_wait_finalize();

    orte_errmgr_base_close();

    /* now can close the rml and its friendly group comm */
    orte_grpcomm_base_close();
    orte_db_base_close();
    orte_routed_base_close();
    orte_rml_base_close();
    
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
        
    return ORTE_SUCCESS;    
}
Example #8
0
int orte_ess_base_app_finalize(void)
{
    orte_cr_finalize();

    /* release the event base so we stop all potential
     * race conditions in the messaging teardown */
    if (progress_thread_running) {
        opal_stop_progress_thread("orte", false);
        progress_thread_running = false;
    }

#if OPAL_ENABLE_FT_CR == 1
    (void) mca_base_framework_close(&orte_snapc_base_framework);
    (void) mca_base_framework_close(&orte_sstore_base_framework);
#endif

    /* close frameworks */
    (void) mca_base_framework_close(&orte_filem_base_framework);
    (void) mca_base_framework_close(&orte_errmgr_base_framework);

    /* now can close the rml and its friendly group comm */
    (void) mca_base_framework_close(&orte_grpcomm_base_framework);
    (void) mca_base_framework_close(&opal_dstore_base_framework);
    (void) mca_base_framework_close(&orte_dfs_base_framework);
    (void) mca_base_framework_close(&orte_routed_base_framework);

    (void) mca_base_framework_close(&orte_rml_base_framework);
    (void) mca_base_framework_close(&orte_oob_base_framework);
    (void) mca_base_framework_close(&orte_state_base_framework);

    orte_session_dir_finalize(ORTE_PROC_MY_NAME);

    /* free the event base to cleanup memory */
    opal_stop_progress_thread("orte", true);
    return ORTE_SUCCESS;    
}
static int rte_finalize(void)
{
    char *contact_path;
    opal_list_item_t *item;

    /* remove my contact info file */
    contact_path = opal_os_path(false, orte_process_info.top_session_dir,
                                "contact.txt", NULL);
    unlink(contact_path);
    free(contact_path);
    
    orte_notifier_base_close();
    
    orte_cr_finalize();
    
#if OPAL_ENABLE_FT == 1
    orte_snapc_base_close();
#endif
    orte_filem_base_close();
    
    orte_odls_base_close();
    
    orte_wait_finalize();
    orte_iof_base_close();
    
    /* finalize selected modules so they can de-register
     * any receives
     */
    orte_ras_base_close();
    orte_rmaps_base_close();
    orte_plm_base_close();
    orte_errmgr_base_close();
    
    /* now can close the rml and its friendly group comm */
    orte_grpcomm_base_close();
    orte_routed_base_close();
    orte_rml_base_close();
    
    /* cleanup the global list of local children and job data */
    while (NULL != (item = opal_list_remove_first(&orte_local_children))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&orte_local_children);
    while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&orte_local_jobdata);
    
    /* finalize the session directory tree */
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
    
    /* clean out the global structures */
    orte_proc_info_finalize();
    if (NULL != orte_job_ident) {
        free(orte_job_ident);
    }

    
    /* close the xml output file, if open */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "</%s>\n", orte_cmd_basename);
        fflush(orte_xml_fp);
        if (stdout != orte_xml_fp) {
            fclose(orte_xml_fp);
        }
    }
    
    return ORTE_SUCCESS;    
}
static int rte_finalize(void)
{
    char *contact_path;
    opal_list_item_t *item;
    orte_node_t *node;
    orte_job_t *job;
    int i;

    /* remove my contact info file */
    contact_path = opal_os_path(false, orte_process_info.top_session_dir,
                                "contact.txt", NULL);
    unlink(contact_path);
    free(contact_path);
    
    orte_notifier_base_close();
    
    orte_cr_finalize();
    
#if OPAL_ENABLE_FT_CR == 1
    orte_snapc_base_close();
#endif
    orte_filem_base_close();
    
    orte_odls_base_close();
    
    orte_wait_finalize();
    orte_iof_base_close();
    
    /* finalize selected modules so they can de-register
     * any receives
     */
    orte_ras_base_close();
    orte_rmaps_base_close();
    orte_plm_base_close();
    orte_errmgr_base_close();
    
    /* close the multicast */
#if ORTE_ENABLE_MULTICAST
    orte_rmcast_base_close();
#endif

    /* now can close the rml and its friendly group comm */
    orte_grpcomm_base_close();
    orte_routed_base_close();
    orte_rml_base_close();

    /* if we were doing timing studies, close the timing file */
    if (orte_timing) {
        if (stdout != orte_timing_output &&
            stderr != orte_timing_output) {
            fclose(orte_timing_output);
        }
    }
    
    /* cleanup the global list of local children and job data */
    while (NULL != (item = opal_list_remove_first(&orte_local_children))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&orte_local_children);
    while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&orte_local_jobdata);
    
    /* cleanup the job and node info arrays */
    if (NULL != orte_node_pool) {
        for (i=0; i < orte_node_pool->size; i++) {
            if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) {
                OBJ_RELEASE(node);
            }
        }
        OBJ_RELEASE(orte_node_pool);
    }
    if (NULL != orte_job_data) {
        for (i=0; i < orte_job_data->size; i++) {
            if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) {
                OBJ_RELEASE(job);
            }
        }
        OBJ_RELEASE(orte_job_data);
    }

    /* finalize the session directory tree */
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
    
    /* clean out the global structures */
    orte_proc_info_finalize();
    if (NULL != orte_job_ident) {
        free(orte_job_ident);
    }
    
    /* close the xml output file, if open */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "</mpirun>\n");
        fflush(orte_xml_fp);
        if (stdout != orte_xml_fp) {
            fclose(orte_xml_fp);
        }
    }
    
    /* handle the orted-specific OPAL stuff */
    opal_sysinfo_base_close();
    opal_pstat_base_close();

    return ORTE_SUCCESS;    
}