int orte_ess_base_app_finalize(void) { orte_cr_finalize(); #if OPAL_ENABLE_FT_CR == 1 (void) mca_base_framework_close(&orte_snapc_base_framework); (void) mca_base_framework_close(&orte_sstore_base_framework); #endif /* close frameworks */ (void) mca_base_framework_close(&orte_filem_base_framework); (void) mca_base_framework_close(&orte_errmgr_base_framework); /* now can close the rml and its friendly group comm */ (void) mca_base_framework_close(&orte_grpcomm_base_framework); (void) mca_base_framework_close(&orte_dfs_base_framework); (void) mca_base_framework_close(&orte_routed_base_framework); (void) mca_base_framework_close(&orte_rml_base_framework); (void) mca_base_framework_close(&orte_oob_base_framework); (void) mca_base_framework_close(&orte_state_base_framework); orte_session_dir_finalize(ORTE_PROC_MY_NAME); /* release the event base */ if (progress_thread_running) { opal_progress_thread_finalize(NULL); progress_thread_running = false; } return ORTE_SUCCESS; }
int orte_ess_base_app_finalize(void) { orte_notifier_base_close(); orte_cr_finalize(); #if OPAL_ENABLE_FT_CR == 1 orte_snapc_base_close(); #endif orte_filem_base_close(); orte_wait_finalize(); /* close the multicast */ #if ORTE_ENABLE_MULTICAST orte_rmcast_base_close(); #endif /* now can close the rml and its friendly group comm */ orte_grpcomm_base_close(); orte_routed_base_close(); orte_rml_base_close(); orte_session_dir_finalize(ORTE_PROC_MY_NAME); return ORTE_SUCCESS; }
/*
 * Abort this process with the given exit status.
 *
 * A regular finalize is deliberately NOT performed: we are aborting
 * because of an abnormal condition, and a full teardown would very
 * likely hang.  Only the minimum needed to leave a clean environment
 * is done (taken from orte_finalize(); the errmgr is assumed to clean
 * up child processes before we exit).
 *
 * status - value handed to exit()
 * report - not consulted by this implementation
 *
 * Does not return.
 */
static void rte_abort(int status, bool report)
{
    (void) report;

    /* the CRS may still own a named pipe and an active thread */
    orte_cr_finalize();

    /* scrub the whole session directory tree */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

    /* release the global structures - not strictly required,
     * but good practice */
    orte_proc_info_finalize();

    exit(status);
}
void orte_ess_base_app_abort(int status, bool report) { orte_daemon_cmd_flag_t cmd=ORTE_DAEMON_ABORT_CALLED; opal_buffer_t *buf; /* Exit - do NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition * that precludes normal cleanup * * We do need to do the following bits to make sure we leave a * clean environment. Taken from orte_finalize(): * - Assume errmgr cleans up child processes before we exit. */ /* CRS cleanup since it may have a named pipe and thread active */ orte_cr_finalize(); /* If we were asked to report this termination, do so */ if (report) { buf = OBJ_NEW(opal_buffer_t); opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD); orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, 0, orte_rml_send_callback, NULL); OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "%s orte_ess_app_abort: sent abort msg to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON))); /* get the ack - need this to ensure that the sync communication * gets serviced by the event library on the orted prior to the * process exiting */ sync_waiting = true; if (ORTE_SUCCESS != orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT, ORTE_RML_NON_PERSISTENT, report_sync, NULL)) { exit(status); } ORTE_WAIT_FOR_COMPLETION(sync_waiting); } /* - Clean out the global structures * (not really necessary, but good practice) */ orte_proc_info_finalize(); /* Now Exit */ exit(status); }
/* * We do NOT call the regular C-library "abort" function, even * though that would have alerted us to the fact that this is * an abnormal termination, because it would automatically cause * a core file to be generated. On large systems, that can be * overwhelming (imagine a few thousand Gbyte-sized files hitting * a shared file system simultaneously...ouch!). * * However, this causes a problem for OpenRTE as the system truly * needs to know that this actually IS an abnormal termination. * To get around the problem, we create a file in the session * directory - we don't need to put anything in it, though, as its * very existence simply alerts us that this was an abnormal * termination. * * The session directory finalize system will clean this file up * for us automagically. However, it needs to stick around long * enough for our local daemon to find it! So, we do NOT call * session_dir_finalize here!!! Someone will clean up for us. * * In some cases, however, we DON'T want to create that alert. For * example, if an orted detects that the HNP has died, then there * is truly nobody to alert! In these cases, we pass report=false * to prevent the abort file from being created. This allows the * session directory tree to cleanly be eliminated. */ void orte_ess_base_app_abort(int status, bool report) { char *abort_file; int fd; /* Exit - do NOT do a normal finalize as this will very likely * hang the process. We are aborting due to an abnormal condition * that precludes normal cleanup * * We do need to do the following bits to make sure we leave a * clean environment. Taken from orte_finalize(): * - Assume errmgr cleans up child processes before we exit. 
*/ /* CRS cleanup since it may have a named pipe and thread active */ orte_cr_finalize(); /* If we were asked to report this termination, * write an "abort" file into our session directory */ if (report) { abort_file = opal_os_path(false, orte_process_info.proc_session_dir, "abort", NULL); if (NULL == abort_file) { /* got a problem */ ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); goto CLEANUP; } OPAL_OUTPUT_VERBOSE((5, orte_debug_output, "%s orte_ess_app_abort: dropping abort file %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), abort_file)); fd = open(abort_file, O_CREAT, 0600); if (0 < fd) close(fd); } CLEANUP: /* - Clean out the global structures * (not really necessary, but good practice) */ orte_proc_info_finalize(); /* Now Exit */ exit(status); }
/*
 * Abort an application process, optionally dropping an "aborted"
 * marker in its proc-level session directory.
 *
 * We do NOT call the regular C-library "abort" function, even though
 * that would have alerted us to the fact that this is an abnormal
 * termination, because it would automatically cause a core file to be
 * generated.  On large systems, that can be overwhelming (imagine a
 * few thousand Gbyte-sized files hitting a shared file system
 * simultaneously...ouch!).
 *
 * However, OpenRTE truly needs to know that this IS an abnormal
 * termination.  To get around the problem, we drop a marker in the
 * proc-level session dir.  If session dirs were not allowed, the
 * question is simply ignored.
 *
 * In some cases we do NOT want to create that alert.  For example, if
 * an orted detects that the HNP has died, there is nobody to alert.
 * Callers pass report=false to indicate that no marker is wanted.
 *
 * status - value handed to _exit()
 * report - when true (and routing and session dirs are enabled),
 *          drop the "aborted" marker before exiting
 *
 * Does not return.
 */
void orte_ess_base_app_abort(int status, bool report)
{
    int fd;
    char *myfile;
    struct timespec tp = {0, 100000};

    /* Do NOT do a normal finalize as this will very likely hang the
     * process: we are aborting due to an abnormal condition that
     * precludes normal cleanup.  Only the bits needed to leave a clean
     * environment are done (taken from orte_finalize(); the errmgr is
     * assumed to clean up child processes before we exit).
     */

    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();

    /* If we were asked to report this termination, do so.
     * Since singletons don't start an HNP unless necessary, and
     * direct-launched procs don't have daemons at all, only drop
     * the marker if routing is enabled, as this indicates there is
     * someone to report to - and only if session directories are
     * being created at all */
    if (report && orte_routing_is_enabled && orte_create_session_dirs) {
        myfile = opal_os_path(false,
                              orte_process_info.proc_session_dir,
                              "aborted", NULL);
        /* the path may not have been built; never hand NULL to open() */
        if (NULL != myfile) {
            fd = open(myfile, O_CREAT, S_IRUSR);
            /* only close a descriptor that open() actually returned */
            if (0 <= fd) {
                close(fd);
            }
        }
        /* now introduce a short delay to allow any pending
         * messages (e.g., from a call to "show_help") to
         * have a chance to be sent */
        nanosleep(&tp, NULL);
    }

    /* Clean out the global structures
     * (not really necessary, but good practice) */
    orte_proc_info_finalize();

    /* Now exit */
    _exit(status);
}
int orte_ess_base_app_finalize(void) { orte_cr_finalize(); #if OPAL_ENABLE_FT_CR == 1 orte_snapc_base_close(); #endif orte_filem_base_close(); orte_wait_finalize(); orte_errmgr_base_close(); /* now can close the rml and its friendly group comm */ orte_grpcomm_base_close(); orte_db_base_close(); orte_routed_base_close(); orte_rml_base_close(); orte_session_dir_finalize(ORTE_PROC_MY_NAME); return ORTE_SUCCESS; }
int orte_ess_base_app_finalize(void) { orte_cr_finalize(); /* release the event base so we stop all potential * race conditions in the messaging teardown */ if (progress_thread_running) { opal_stop_progress_thread("orte", false); progress_thread_running = false; } #if OPAL_ENABLE_FT_CR == 1 (void) mca_base_framework_close(&orte_snapc_base_framework); (void) mca_base_framework_close(&orte_sstore_base_framework); #endif /* close frameworks */ (void) mca_base_framework_close(&orte_filem_base_framework); (void) mca_base_framework_close(&orte_errmgr_base_framework); /* now can close the rml and its friendly group comm */ (void) mca_base_framework_close(&orte_grpcomm_base_framework); (void) mca_base_framework_close(&opal_dstore_base_framework); (void) mca_base_framework_close(&orte_dfs_base_framework); (void) mca_base_framework_close(&orte_routed_base_framework); (void) mca_base_framework_close(&orte_rml_base_framework); (void) mca_base_framework_close(&orte_oob_base_framework); (void) mca_base_framework_close(&orte_state_base_framework); orte_session_dir_finalize(ORTE_PROC_MY_NAME); /* free the event base to cleanup memory */ opal_stop_progress_thread("orte", true); return ORTE_SUCCESS; }
static int rte_finalize(void) { char *contact_path; opal_list_item_t *item; /* remove my contact info file */ contact_path = opal_os_path(false, orte_process_info.top_session_dir, "contact.txt", NULL); unlink(contact_path); free(contact_path); orte_notifier_base_close(); orte_cr_finalize(); #if OPAL_ENABLE_FT == 1 orte_snapc_base_close(); #endif orte_filem_base_close(); orte_odls_base_close(); orte_wait_finalize(); orte_iof_base_close(); /* finalize selected modules so they can de-register * any receives */ orte_ras_base_close(); orte_rmaps_base_close(); orte_plm_base_close(); orte_errmgr_base_close(); /* now can close the rml and its friendly group comm */ orte_grpcomm_base_close(); orte_routed_base_close(); orte_rml_base_close(); /* cleanup the global list of local children and job data */ while (NULL != (item = opal_list_remove_first(&orte_local_children))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&orte_local_children); while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&orte_local_jobdata); /* finalize the session directory tree */ orte_session_dir_finalize(ORTE_PROC_MY_NAME); /* clean out the global structures */ orte_proc_info_finalize(); if (NULL != orte_job_ident) { free(orte_job_ident); } /* close the xml output file, if open */ if (orte_xml_output) { fprintf(orte_xml_fp, "</%s>\n", orte_cmd_basename); fflush(orte_xml_fp); if (stdout != orte_xml_fp) { fclose(orte_xml_fp); } } return ORTE_SUCCESS; }
static int rte_finalize(void) { char *contact_path; opal_list_item_t *item; orte_node_t *node; orte_job_t *job; int i; /* remove my contact info file */ contact_path = opal_os_path(false, orte_process_info.top_session_dir, "contact.txt", NULL); unlink(contact_path); free(contact_path); orte_notifier_base_close(); orte_cr_finalize(); #if OPAL_ENABLE_FT_CR == 1 orte_snapc_base_close(); #endif orte_filem_base_close(); orte_odls_base_close(); orte_wait_finalize(); orte_iof_base_close(); /* finalize selected modules so they can de-register * any receives */ orte_ras_base_close(); orte_rmaps_base_close(); orte_plm_base_close(); orte_errmgr_base_close(); /* close the multicast */ #if ORTE_ENABLE_MULTICAST orte_rmcast_base_close(); #endif /* now can close the rml and its friendly group comm */ orte_grpcomm_base_close(); orte_routed_base_close(); orte_rml_base_close(); /* if we were doing timing studies, close the timing file */ if (orte_timing) { if (stdout != orte_timing_output && stderr != orte_timing_output) { fclose(orte_timing_output); } } /* cleanup the global list of local children and job data */ while (NULL != (item = opal_list_remove_first(&orte_local_children))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&orte_local_children); while (NULL != (item = opal_list_remove_first(&orte_local_jobdata))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&orte_local_jobdata); /* cleanup the job and node info arrays */ if (NULL != orte_node_pool) { for (i=0; i < orte_node_pool->size; i++) { if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool,i))) { OBJ_RELEASE(node); } } OBJ_RELEASE(orte_node_pool); } if (NULL != orte_job_data) { for (i=0; i < orte_job_data->size; i++) { if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data,i))) { OBJ_RELEASE(job); } } OBJ_RELEASE(orte_job_data); } /* finalize the session directory tree */ orte_session_dir_finalize(ORTE_PROC_MY_NAME); /* clean out the global structures */ orte_proc_info_finalize(); if (NULL != 
orte_job_ident) { free(orte_job_ident); } /* close the xml output file, if open */ if (orte_xml_output) { fprintf(orte_xml_fp, "</mpirun>\n"); fflush(orte_xml_fp); if (stdout != orte_xml_fp) { fclose(orte_xml_fp); } } /* handle the orted-specific OPAL stuff */ opal_sysinfo_base_close(); opal_pstat_base_close(); return ORTE_SUCCESS; }