int main(int argc, char** argv) { int rc; opal_thread_t thr1; opal_thread_t thr2; test_init("opal_thread_t"); OBJ_CONSTRUCT(&thr1, opal_thread_t); OBJ_CONSTRUCT(&thr2, opal_thread_t); thr1.t_run = thr1_run; thr2.t_run = thr2_run; rc = opal_thread_start(&thr1); test_verify_int(OPAL_SUCCESS, rc); rc = opal_thread_start(&thr2); test_verify_int(OPAL_SUCCESS, rc); rc = opal_thread_join(&thr1, NULL); test_verify_int(OPAL_SUCCESS, rc); rc = opal_thread_join(&thr2, NULL); test_verify_int(OPAL_SUCCESS, rc); test_verify_int(3, count); return test_finalize(); }
int orte_rmcast_base_start_threads(void) { int rc; if (!orte_rmcast_base.recv_process_ctl.running) { OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output, "%s rmcast:base: starting recv processing thread", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* setup a pipe that we will use to signal the thread that a message * is waiting to be processed - don't define an event for it */ if (pipe(orte_rmcast_base.recv_pipe) < 0) { opal_output(0, "%s Cannot open recv processing thread ctl pipe", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); return ORTE_ERR_OUT_OF_RESOURCE; } /* start the thread - we will send it a NULL msg pointer when * we want it to stop */ orte_rmcast_base.recv_process.t_run = rcv_processing_thread; if (ORTE_SUCCESS != (rc = opal_thread_start(&orte_rmcast_base.recv_process))) { ORTE_ERROR_LOG(rc); orte_rmcast_base.recv_process_ctl.running = false; return rc; } OPAL_OUTPUT_VERBOSE((5, orte_rmcast_base.rmcast_output, "%s rmcast:base: recv processing thread started", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } return ORTE_SUCCESS; }
/* * Initialize global variables used w/in this module. */ static void tcp_init(void) { /* setup the module's state variables */ OBJ_CONSTRUCT(&mca_oob_tcp_module.peers, opal_hash_table_t); opal_hash_table_init(&mca_oob_tcp_module.peers, 32); mca_oob_tcp_module.ev_active = false; if (orte_oob_base.use_module_threads) { /* if we are to use independent progress threads at * the module level, start it now */ opal_output_verbose(2, orte_oob_base_framework.framework_output, "%s STARTING TCP PROGRESS THREAD", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); mca_oob_tcp_module.ev_base = opal_event_base_create(); /* construct the thread object */ OBJ_CONSTRUCT(&mca_oob_tcp_module.progress_thread, opal_thread_t); /* fork off a thread to progress it */ mca_oob_tcp_module.progress_thread.t_run = progress_thread_engine; mca_oob_tcp_module.ev_active = true; if (OPAL_SUCCESS != opal_thread_start(&mca_oob_tcp_module.progress_thread)) { opal_output(0, "%s progress thread failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } } }
opal_event_base_t *opal_progress_thread_init(const char *name) { assert(NULL == name); /* Create the event base */ agent_evbase = opal_event_base_create(); if (NULL == agent_evbase) { return NULL; } /* add an event to the new event base (if there are no events, opal_event_loop() will return immediately) */ opal_event_set(agent_evbase, &blocker, -1, OPAL_EV_PERSIST, blocker_timeout_cb, NULL); opal_event_add(&blocker, &long_timeout); /* Spawn the agent thread event loop */ OBJ_CONSTRUCT(&agent_thread, opal_thread_t); agent_thread.t_run = agent_thread_main; agent_thread.t_arg = NULL; int ret; ret = opal_thread_start(&agent_thread); if (OPAL_SUCCESS != ret) { OPAL_ERROR_LOG(ret); ABORT("Failed to start usNIC agent thread"); /* Will not return */ } return agent_evbase; }
int main(int argc, char** argv) { int rc; opal_thread_t* thr1; opal_thread_t* thr2; test_init("opal_condition_t"); rc = opal_init(&argc, &argv); test_verify_int(OPAL_SUCCESS, rc); if (OPAL_SUCCESS != rc) { test_finalize(); exit(1); } opal_set_using_threads(true); OBJ_CONSTRUCT(&mutex, opal_mutex_t); OBJ_CONSTRUCT(&thr1_cond, opal_condition_t); OBJ_CONSTRUCT(&thr2_cond, opal_condition_t); thr1 = OBJ_NEW(opal_thread_t); thr2 = OBJ_NEW(opal_thread_t); thr1->t_run = thr1_run; thr2->t_run = thr2_run; rc = opal_thread_start(thr1); test_verify_int(OPAL_SUCCESS, rc); rc = opal_thread_start(thr2); test_verify_int(OPAL_SUCCESS, rc); rc = opal_thread_join(thr1, NULL); test_verify_int(OPAL_SUCCESS, rc); test_verify_int(TEST_COUNT, thr1_count); rc = opal_thread_join(thr2, NULL); test_verify_int(OPAL_SUCCESS, rc); test_verify_int(TEST_COUNT, thr2_count); opal_finalize(); return test_finalize(); }
OMPI_DECLSPEC void mca_io_base_request_progress_add(void) { #if OMPI_ENABLE_PROGRESS_THREADS /* if we don't have a progress thread, make us have a progress thread */ if (! thread_running) { OPAL_THREAD_LOCK(&progress_mutex); if (! thread_running) { thread_running = true; opal_thread_start(&progress_thread); } OPAL_THREAD_UNLOCK(&progress_mutex); } #endif /* OMPI_ENABLE_PROGRESS_THREADS */ OPAL_THREAD_ADD32(&mca_io_base_request_num_pending, 1); #if OMPI_ENABLE_PROGRESS_THREADS opal_condition_signal(&progress_cond); #endif /* OMPI_ENABLE_PROGRESS_THREADS */ }
/**
 * Initialize the ORTE layer.
 *
 * Repeated calls only bump orte_initialized and return ORTE_SUCCESS.  The
 * first call initializes OPAL, records the process type, sets up locks,
 * MCA parameters, show_help, error-string conversion and process info,
 * opens/selects the ESS framework, chooses the event base, and finally
 * runs orte_ess.init().
 *
 * For application processes built with full support and
 * ORTE_ENABLE_PROGRESS_THREADS, a separate event base plus a progress
 * thread is created; otherwise orte_event_base aliases opal_event_base.
 *
 * @param pargc  passed through to opal_init()
 * @param pargv  passed through to opal_init()
 * @param flags  process type stored in orte_process_info.proc_type
 * @return ORTE_SUCCESS, or the error code of the step that failed.  Unless
 *         that code is ORTE_ERR_SILENT, a help message naming the failing
 *         step is shown first.
 */
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }

    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }

    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }

    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        /* name the step that actually failed */
        error = "orte_show_help_init";
        goto error;
    }

    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
        error = "orte_ess_base_open";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

    if (ORTE_PROC_IS_APP) {
#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREADS
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
        /* get a separate orte event base */
        orte_event_base = opal_event_base_create();
        /* setup the finalize event - we'll need it
         * to break the thread out of the event lib
         * when we want to stop it
         */
        opal_event_set(orte_event_base, &orte_finalize_event, -1,
                       OPAL_EV_WRITE, ignore_callback, NULL);
        opal_event_set_priority(&orte_finalize_event, ORTE_ERROR_PRI);
#if 0
        {
            /* seems strange, but wake us up once a second just so we can check for new events */
            opal_event_t *ev;
            struct timeval tv = {1,0};
            ev = opal_event_alloc();
            opal_event_evtimer_set(orte_event_base, ev, ignore_callback, ev);
            opal_event_set_priority(ev, ORTE_INFO_PRI);
            opal_event_evtimer_add(ev, &tv);
        }
#endif
        /* construct the thread object */
        OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
        /* fork off a thread to progress it */
        orte_progress_thread.t_run = orte_progress_thread_engine;
        if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
            error = "orte progress thread start";
            goto error;
        }
#else
        error = "event thread support is not configured";
        ret = ORTE_ERROR;
        goto error;
#endif
#else
        /* set the event base to the opal one */
        orte_event_base = opal_event_base;
#endif
    } else {
        /* set the event base to the opal one */
        orte_event_base = opal_event_base;
    }

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }

    /* All done */
    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } ret = opal_cr_register (); if (OPAL_SUCCESS != ret) { return ret; } if(0 != opal_cr_verbose) { opal_cr_output = opal_output_open(NULL); opal_output_set_verbosity(opal_cr_output, opal_cr_verbose); } opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", opal_cr_verbose); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %s", opal_cr_is_enabled ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %s", opal_cr_is_tool ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", opal_cr_verbose, (opal_cr_debug_sigpipe ? "True" : "False")); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); #if OPAL_ENABLE_FT_THREAD == 1 opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %s", opal_cr_thread_use_if_avail ? "true" : "false"); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. */ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #if OPAL_ENABLE_CRDEBUG == 1 opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s]\n", (MPIR_debug_with_checkpoint ? 
"True": "False")); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal (Debug): %d", opal_cr_debug_signal); if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) { opal_output(opal_cr_output, "opal_cr: init: Failed to register C/R debug signal (%d)", opal_cr_debug_signal); } #endif opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. */ #if OPAL_ENABLE_FT_CR == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_crs_base_framework, 0))) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_open", ret ); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_select", ret ); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); /* JJH: We really do need this line below since it enables * actual locks for threads. However currently the * upper layers will deadlock if it is enabled. * So hack around the problem for now, while working * on a complete solution. See ticket #2741 for more * details. 
* opal_set_using_threads(true); */ /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; int val; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } /* * Some startup MCA parameters */ ret = mca_base_param_reg_int_name("opal_cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", false, false, 0, &val); if(0 != val) { opal_cr_output = opal_output_open(NULL); } else { opal_cr_output = -1; } opal_output_set_verbosity(opal_cr_output, val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", val); mca_base_param_reg_int_name("ft", "cr_enabled", "Enable fault tolerance for this program", false, false, 0, &val); opal_cr_set_enabled(OPAL_INT_TO_BOOL(val)); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %d", val); mca_base_param_reg_int_name("opal_cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", false, false, 0, &val); opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier (Default: Disabled)", false, false, 0, &val); if( opal_cr_timing_enabled ) { opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); } else { opal_cr_timing_barrier_enabled = false; } mca_base_param_reg_int_name("opal_cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", false, false, 0, &val); opal_cr_timing_target_rank = val; #if OPAL_ENABLE_FT_THREAD == 1 mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", false, false, 0, &val); opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %d", val); mca_base_param_reg_int_name("opal_cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", false, false, 0, &val); 
opal_cr_thread_sleep_check = val; mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 0)", false, false, 0, &val); opal_cr_thread_sleep_wait = val; opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); #endif mca_base_param_reg_int_name("opal_cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", false, false, 0, &val); opal_cr_is_tool = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %d", val); #ifndef __WINDOWS__ mca_base_param_reg_int_name("opal_cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", false, false, SIGUSR1, &opal_cr_entry_point_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", false, false, 0, &val); opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", val, (opal_cr_debug_sigpipe ? "True" : "False")); #if OPAL_ENABLE_FT_THREAD == 1 /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. 
*/ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, "/tmp", &opal_cr_pipe_dir); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. */ #if OPAL_ENABLE_FT == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = opal_crs_base_open())) { opal_output(opal_cr_output, "opal_cr: init: opal_crs_base_open Failed to open. (%d)\n", ret); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_output(opal_cr_output, "opal_cr: init: opal_crs_base_select Failed. 
(%d)\n", ret); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); opal_set_using_threads(true); /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }
static int tool_init(void) { int ret = ORTE_ERROR; char *error = NULL; opal_buffer_t buf, *clusterbuf, *uribuf; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; opal_list_t config; orcm_scheduler_t *scheduler; orcm_node_t *mynode=NULL; int32_t n; if (initialized) { return ORCM_SUCCESS; } initialized = true; /* Initialize the ORTE data type support */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_std_prolog"; goto error; } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* create a job tracker for the daemons */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = 0; ORTE_PROC_MY_NAME->jobid = 0; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* read the site configuration */ OBJ_CONSTRUCT(&config, opal_list_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) { error = "getting config"; goto error; } /* define the cluster and collect contact info for all * aggregators - we'll need to know how to talk to any * of them in case of failures */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config, &mynode, &orte_process_info.num_procs, &buf))) { OBJ_DESTRUCT(&buf); error = "define system"; goto error; } /* define a name for myself 
*/ if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_set_hnp_name"; goto error; } /* define a node and proc object for ourselves as some parts * of ORTE and ORCM require it */ if (NULL == (node = OBJ_NEW(orte_node_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } node->name = strdup(orte_process_info.nodename); opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); if (NULL == (proc = OBJ_NEW(orte_proc_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; OBJ_RETAIN(proc); node->daemon = proc; OBJ_RETAIN(node); proc->node = node; opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc); /* For now, we only support a single scheduler daemon in the system. * This *may* change someday in the future */ scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers); ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid; ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid; /* register the ORTE-level params at this time now that the * config has had a chance to push things into the environ */ if (ORTE_SUCCESS != (ret = orte_register_params())) { OBJ_DESTRUCT(&buf); error = "orte_register_params"; goto error; } /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. 
*/ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); signals_set = true; /* open and select the pstat framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_open"; goto error; } /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { 
ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_open"; goto error; } /* always restrict daemons to local database components */ if (ORTE_SUCCESS != (ret = orcm_db_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_select"; goto error; } /* datastore - ensure we don't pickup the pmi component, but * don't override anything set by user */ if (NULL == getenv("OMPI_MCA_dstore")) { putenv("OMPI_MCA_dstore=^pmi"); } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_select"; goto error; } /* create the handles */ if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) { error = "opal dstore global"; ret = ORTE_ERR_FATAL; goto error; } if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) { error = "opal dstore internal"; ret = ORTE_ERR_FATAL; goto error; } if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) { error = "opal dstore nonpeer"; ret = ORTE_ERR_FATAL; goto error; } /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_util_nidmap_init"; goto error; } /* extract the cluster description and setup the routed info - the orcm routed component * will know what to do. 
*/ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract cluster buf"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(clusterbuf); error = "orte_routed.init_routes"; goto error; } OBJ_RELEASE(clusterbuf); /* extract the uri buffer and load the hash tables */ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract uri buffer"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); error = "load hash tables"; goto error; } OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); /* construct the thread object */ OBJ_CONSTRUCT(&progress_thread, opal_thread_t); /* fork off a thread to progress it */ progress_thread.t_run = progress_thread_engine; progress_thread_running = true; if (OPAL_SUCCESS != (ret = opal_thread_start(&progress_thread))) { error = "progress thread start"; progress_thread_running = false; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* setup the FileM 
*/ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ opal_cr_set_enabled(false); if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the DFS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_select"; goto error; } return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
/**
 * Initialize the ORTE layer.
 *
 * Repeated calls only bump orte_initialized and return ORTE_SUCCESS.  The
 * first call initializes OPAL, records the process type, sets up locks,
 * MCA parameters, show_help, error-string conversion and process info,
 * opens/selects the ESS framework, chooses the event base, and finally
 * runs orte_ess.init().
 *
 * With ORTE_ENABLE_PROGRESS_THREADS a separate event base plus a progress
 * thread is created (this requires OPAL_EVENT_HAVE_THREAD_SUPPORT);
 * otherwise orte_event_base aliases opal_event_base.
 *
 * @param pargc  passed through to opal_init()
 * @param pargv  passed through to opal_init()
 * @param flags  process type stored in orte_process_info.proc_type
 * @return ORTE_SUCCESS, or the error code of the step that failed.  Unless
 *         that code is ORTE_ERR_SILENT, a help message naming the failing
 *         step is shown first.
 */
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }

    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }

    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }

    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        /* name the step that actually failed */
        error = "orte_show_help_init";
        goto error;
    }

    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
        error = "orte_ess_base_open";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

#if ORTE_ENABLE_PROGRESS_THREADS
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
    /* get a separate orte event base */
    orte_event_base = opal_event_base_create();
    /* construct the thread object */
    OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
    /* fork off a thread to progress it */
    orte_progress_thread.t_run = orte_progress_thread_engine;
    if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
        error = "orte progress thread start";
        goto error;
    }
#else
    error = "event thread support is not configured";
    ret = ORTE_ERROR;
    goto error;
#endif
#else
    /* set the event base to the opal one */
    orte_event_base = opal_event_base;
#endif

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }

    /* All done */
    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
int opal_cr_init(void ) { int ret, exit_status = OPAL_SUCCESS; opal_cr_coord_callback_fn_t prev_coord_func; int val, t; if( ++opal_cr_initalized != 1 ) { if( opal_cr_initalized < 1 ) { exit_status = OPAL_ERROR; goto cleanup; } exit_status = OPAL_SUCCESS; goto cleanup; } /* * Some startup MCA parameters */ ret = mca_base_param_reg_int_name("opal_cr", "verbose", "Verbose output level for the runtime OPAL Checkpoint/Restart functionality", false, false, 0, &val); if(0 != val) { opal_cr_output = opal_output_open(NULL); } else { opal_cr_output = -1; } opal_output_set_verbosity(opal_cr_output, val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Verbose Level: %d", val); mca_base_param_reg_int_name("ft", "cr_enabled", "Enable fault tolerance for this program", false, false, 0, &val); opal_cr_set_enabled(OPAL_INT_TO_BOOL(val)); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Enabled: %d", val); mca_base_param_reg_int_name("opal_cr", "enable_timer", "Enable Checkpoint timer (Default: Disabled)", false, false, 0, &val); opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", "Enable Checkpoint timer Barrier (Default: Disabled)", false, false, 0, &val); if( opal_cr_timing_enabled ) { opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); } else { opal_cr_timing_barrier_enabled = false; } mca_base_param_reg_int_name("opal_cr", "timer_target_rank", "Target Rank for the timer (Default: 0)", false, false, 0, &val); opal_cr_timing_target_rank = val; #if OPAL_ENABLE_FT_THREAD == 1 mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", false, false, 0, &val); opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT Use thread: %d", val); mca_base_param_reg_int_name("opal_cr", "thread_sleep_check", "Time to sleep between checking for a checkpoint (Default: 0)", false, false, 0, &val); 
opal_cr_thread_sleep_check = val; mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait", "Time to sleep waiting for process to exit MPI library (Default: 1000)", false, false, 1000, &val); opal_cr_thread_sleep_wait = val; opal_output_verbose(10, opal_cr_output, "opal_cr: init: FT thread sleep: check = %d, wait = %d", opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait); #endif mca_base_param_reg_int_name("opal_cr", "is_tool", "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.", false, false, 0, &val); opal_cr_is_tool = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Is a tool program: %d", val); #if OPAL_ENABLE_CRDEBUG == 1 mca_base_param_reg_int_name("opal_cr", "enable_crdebug", "Enable checkpoint/restart debugging", false, false, 0, &val); MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: C/R Debugging Enabled [%s]\n", (MPIR_debug_with_checkpoint ? "True": "False")); #endif #ifndef __WINDOWS__ mca_base_param_reg_int_name("opal_cr", "signal", "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program", false, false, SIGUSR1, &opal_cr_entry_point_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal: %d", opal_cr_entry_point_signal); mca_base_param_reg_int_name("opal_cr", "debug_sigpipe", "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)", false, false, 0, &val); opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Debug SIGPIPE: %d (%s)", val, (opal_cr_debug_sigpipe ? "True" : "False")); #if OPAL_ENABLE_FT_THREAD == 1 /* If we have a thread, then attach the SIGPIPE signal handler there since * it is most likely to be the one that needs it. 
*/ if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #else if( opal_cr_debug_sigpipe ) { if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) { ; } } #endif #else opal_cr_is_tool = true; /* no support for CR on Windows yet */ #endif /* __WINDOWS__ */ #if OPAL_ENABLE_CRDEBUG == 1 opal_cr_debug_num_free_threads = 3; opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads ); for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) { opal_cr_debug_free_threads[t] = NULL; } mca_base_param_reg_int_name("opal_cr", "crdebug_signal", "Checkpoint/Restart signal used to hold threads when debugging", false, false, SIGTSTP, &opal_cr_debug_signal); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Checkpoint Signal (Debug): %d", opal_cr_debug_signal); if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) { opal_output(opal_cr_output, "opal_cr: init: Failed to register C/R debug signal (%d)", opal_cr_debug_signal); } #else /* Silence a compiler warning */ t = 0; #endif mca_base_param_reg_string_name("opal_cr", "tmp_dir", "Temporary directory to place rendezvous files for a checkpoint", false, false, opal_tmp_directory(), &opal_cr_pipe_dir); opal_output_verbose(10, opal_cr_output, "opal_cr: init: Temp Directory: %s", opal_cr_pipe_dir); if( !opal_cr_is_tool ) { /* Register the OPAL interlevel coordination callback */ opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func); opal_cr_stall_check = false; opal_cr_currently_stalled = false; } /* End opal_cr_is_tool = true */ /* * If fault tolerance was not compiled in then * we need to make sure that the listener thread is active to tell * the tools that this is not a checkpointable job. * We don't need the CRS framework to be initalized. 
*/ #if OPAL_ENABLE_FT_CR == 1 /* * Open the checkpoint / restart service components */ if (OPAL_SUCCESS != (ret = opal_crs_base_open())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_open", ret ); exit_status = ret; goto cleanup; } if (OPAL_SUCCESS != (ret = opal_crs_base_select())) { opal_show_help( "help-opal-runtime.txt", "opal_cr_init:no-crs", true, "opal_crs_base_select", ret ); exit_status = ret; goto cleanup; } #endif #if OPAL_ENABLE_FT_THREAD == 1 if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) { opal_output_verbose(10, opal_cr_output, "opal_cr: init: starting the thread\n"); /* JJH: We really do need this line below since it enables * actual locks for threads. However currently the * upper layers will deadlock if it is enabled. * So hack around the problem for now, while working * on a complete solution. See ticket #2741 for more * details. * opal_set_using_threads(true); */ /* * Start the thread */ OBJ_CONSTRUCT(&opal_cr_thread, opal_thread_t); OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t); opal_cr_thread_is_done = false; opal_cr_thread_is_active = false; opal_cr_thread_in_library = false; opal_cr_thread_num_in_library = 0; opal_cr_thread.t_run = opal_cr_thread_fn; opal_cr_thread.t_arg = NULL; opal_thread_start(&opal_cr_thread); } /* End opal_cr_is_tool = true */ else { opal_output_verbose(10, opal_cr_output, "opal_cr: init: *Not* Using C/R thread\n"); } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ cleanup: return exit_status; }
int main(int argc, char **argv) { char byte='a'; struct timespec tp= {0, 100}; int count=0; foo_caddy_t *foo; /* Initialize the event library */ opal_init(&argc, &argv); /* setup for threads */ opal_event_use_threads(); /* create a new base */ my_base = orte_event_base_create(); /* launch a progress thread on that base*/ pipe(progress_thread_pipe); OBJ_CONSTRUCT(&lock, opal_mutex_t); OBJ_CONSTRUCT(&cond, opal_condition_t); OBJ_CONSTRUCT(&progress_thread, opal_thread_t); progress_thread.t_run = progress_engine; if (OPAL_SUCCESS != opal_thread_start(&progress_thread)) { fprintf(stderr, "Unable to start progress thread\n"); orte_event_base_finalize(my_base); exit(1); } /* wait a little while - reflects reality in an async system */ while (count < 100) { nanosleep(&tp, NULL); count++; } count=0; /* make a dummy event */ fprintf(stderr, "activating the write_event"); foo = OBJ_NEW(foo_caddy_t); opal_event_set(my_base, &foo->write_event, -1, 0, send_handler, foo); /* activate it. */ opal_event_active(&foo->write_event, EV_WRITE, 1); /* wait for it to trigger */ while (!fd_written && count < 1000) { if (0 == (count % 100)) { fprintf(stderr, "Waiting...\n"); } nanosleep(&tp, NULL); count++; } /* stop the thread */ OPAL_ACQUIRE_THREAD(&lock, &cond, &active); progress_thread_stop = true; OPAL_RELEASE_THREAD(&lock, &cond, &active); opal_fd_write(progress_thread_pipe[1], 1, &byte); opal_thread_join(&progress_thread, NULL); /* release the base */ fprintf(stderr, "Cleaning up\n"); opal_finalize(); fprintf(stderr, "Cleanup completed\n"); return 0; }