/*
 * Finalize the OPAL layer (early/minimal variant).
 *
 * Teardown happens in the reverse order of initialization.  The init/
 * finalize pair is reference counted: only the call that brings the
 * count back to zero performs the actual teardown.
 *
 * @return OPAL_SUCCESS normally; OPAL_ERROR if finalize was called
 *         more times than opal_init() (refcount underflow).
 */
int opal_finalize(void)
{
    /* reference counted: only the last matching finalize does real work */
    if( --opal_initialized != 0 ) {
        if( opal_initialized < 0 ) {
            /* more finalizes than inits -- caller error */
            return OPAL_ERROR;
        }
        return OPAL_SUCCESS;
    }

    /* stop the progress engine before closing anything it may poll */
    opal_progress_finalize();

    /* shut down the event library */
    opal_event_base_close();

    /* close high resolution timers */
    (void) mca_base_framework_close(&opal_timer_base_framework);

    (void) mca_base_framework_close(&opal_backtrace_base_framework);

    /* close hwloc */
    opal_hwloc_base_close();

    /* cleanup the main thread specific stuff */
    opal_tsd_keys_destruct();

    /* finalize util code */
    opal_finalize_util();

    return OPAL_SUCCESS;
}
int opal_finalize(void) { if( --opal_initialized != 0 ) { if( opal_initialized < 0 ) { return OPAL_ERROR; } return OPAL_SUCCESS; } opal_progress_finalize(); /* close the checkpoint and restart service */ opal_cr_finalize(); /* close the security framework */ (void) mca_base_framework_close(&opal_sec_base_framework); #if OPAL_ENABLE_FT_CR == 1 (void) mca_base_framework_close(&opal_compress_base_framework); #endif (void) mca_base_framework_close(&opal_event_base_framework); /* close high resolution timers */ (void) mca_base_framework_close(&opal_timer_base_framework); (void) mca_base_framework_close(&opal_backtrace_base_framework); (void) mca_base_framework_close(&opal_memchecker_base_framework); /* close the memory manager components. Registered hooks can still be fired any time between now and the call to opal_mem_free_finalize(), and callbacks from the memory manager hooks to the bowels of the mem_free code can still occur any time between now and end of application (even post main()!) */ (void) mca_base_framework_close(&opal_memory_base_framework); /* close the memcpy framework */ (void) mca_base_framework_close(&opal_memcpy_base_framework); /* finalize the memory manager / tracker */ opal_mem_hooks_finalize(); /* close the hwloc framework */ (void) mca_base_framework_close(&opal_hwloc_base_framework); /* close the shmem framework */ (void) mca_base_framework_close(&opal_shmem_base_framework); /* close the sec framework */ (void) mca_base_framework_close(&opal_sec_base_framework); /* finalize util code */ opal_finalize_util(); return OPAL_SUCCESS; }
int opal_finalize(void) { if( --opal_initialized != 0 ) { if( opal_initialized < 0 ) { return OPAL_ERROR; } return OPAL_SUCCESS; } opal_progress_finalize(); /* close the checkpoint and restart service */ opal_cr_finalize(); /* close the security framework */ (void) mca_base_framework_close(&opal_sec_base_framework); #if OPAL_ENABLE_FT_CR == 1 (void) mca_base_framework_close(&opal_compress_base_framework); #endif (void) mca_base_framework_close(&opal_event_base_framework); /* close high resolution timers */ (void) mca_base_framework_close(&opal_timer_base_framework); (void) mca_base_framework_close(&opal_backtrace_base_framework); (void) mca_base_framework_close(&opal_memchecker_base_framework); (void) mca_base_framework_close(&opal_patcher_base_framework); /* close the memcpy framework */ (void) mca_base_framework_close(&opal_memcpy_base_framework); /* finalize the memory manager / tracker */ opal_mem_hooks_finalize(); /* close the hwloc framework */ (void) mca_base_framework_close(&opal_hwloc_base_framework); /* close the shmem framework */ (void) mca_base_framework_close(&opal_shmem_base_framework); /* close the sec framework */ (void) mca_base_framework_close(&opal_sec_base_framework); /* finalize util code */ opal_finalize_util(); return OPAL_SUCCESS; }
/*
 * Finalize the OPAL layer (pre-framework-API variant using per-base
 * close functions).  Teardown runs in the reverse order of
 * initialization; the init/finalize pair is reference counted, so only
 * the call that brings the count to zero performs real teardown.
 *
 * @return OPAL_SUCCESS normally; OPAL_ERROR if finalize was called
 *         more times than opal_init() (refcount underflow).
 */
int opal_finalize(void)
{
    /* reference counted: only the last matching finalize does real work */
    if( --opal_initialized != 0 ) {
        if( opal_initialized < 0 ) {
            /* more finalizes than inits -- caller error */
            return OPAL_ERROR;
        }
        return OPAL_SUCCESS;
    }

    /* close the checkpoint and restart service */
    opal_cr_finalize();

    opal_progress_finalize();

    opal_event_fini();

    /* close high resolution timers */
    opal_timer_base_close();

    opal_backtrace_base_close();

    /* close the memory manager components.  Registered hooks can still
       be fired any time between now and the call to
       opal_mem_free_finalize(), and callbacks from the memory manager
       hooks to the bowels of the mem_free code can still occur any
       time between now and end of application (even post main()!) */
    opal_memory_base_close();

    /* finalize the memory manager / tracker */
    opal_mem_hooks_finalize();

    /* close the carto framework */
    opal_carto_base_close();

    /* close the processor affinity base */
    opal_paffinity_base_close();

    /* close the memcpy base */
    opal_memcpy_base_close();

    /* finalize the mca */
    mca_base_close();

    /* finalize util code */
    opal_finalize_util();

    return OPAL_SUCCESS;
}
int ompi_mpi_finalize(void) { int ret; static int32_t finalize_has_already_started = 0; opal_list_item_t *item; struct timeval ompistart, ompistop; ompi_rte_collective_t *coll; ompi_proc_t** procs; size_t nprocs; /* Be a bit social if an erroneous program calls MPI_FINALIZE in two different threads, otherwise we may deadlock in ompi_comm_free() (or run into other nasty lions, tigers, or bears) */ if (! opal_atomic_cmpset_32(&finalize_has_already_started, 0, 1)) { /* Note that if we're already finalized, we cannot raise an MPI exception. The best that we can do is write something to stderr. */ char hostname[MAXHOSTNAMELEN]; pid_t pid = getpid(); gethostname(hostname, sizeof(hostname)); opal_show_help("help-mpi-runtime.txt", "mpi_finalize:invoked_multiple_times", true, hostname, pid); return MPI_ERR_OTHER; } ompi_mpiext_fini(); /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing anything else in MPI_FINALIZE (to include setting up such that MPI_FINALIZED will return true). */ if (NULL != ompi_mpi_comm_self.comm.c_keyhash) { ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self, ompi_mpi_comm_self.comm.c_keyhash); OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash); ompi_mpi_comm_self.comm.c_keyhash = NULL; } /* Proceed with MPI_FINALIZE */ ompi_mpi_finalized = true; /* As finalize is the last legal MPI call, we are allowed to force the release * of the user buffer used for bsend, before going anywhere further. 
*/ (void)mca_pml_base_bsend_detach(NULL, NULL); nprocs = 0; procs = ompi_proc_all(&nprocs); MCA_PML_CALL(del_procs(procs, nprocs)); free(procs); #if OMPI_ENABLE_PROGRESS_THREADS == 0 opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK); #endif /* Redo ORTE calling opal_progress_event_users_increment() during MPI lifetime, to get better latency when not using TCP */ opal_progress_event_users_increment(); /* check to see if we want timing information */ if (ompi_enable_timing != 0 && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistart, NULL); } /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across *all* connected processes. This only means that all processes have to call it. It does *not* mean that all connected processes need to synchronize (either directly or indirectly). For example, it is quite easy to construct complicated scenarios where one job is "connected" to another job via transitivity, but have no direct knowledge of each other. Consider the following case: job A spawns job B, and job B later spawns job C. A "connectedness" graph looks something like this: A <--> B <--> C So what are we *supposed* to do in this case? If job A is still connected to B when it calls FINALIZE, should it block until jobs B and C also call FINALIZE? After lengthy discussions many times over the course of this project, the issue was finally decided at the Louisville Feb 2009 meeting: no. Rationale: - "Collective" does not mean synchronizing. It only means that every process call it. Hence, in this scenario, every process in A, B, and C must call FINALIZE. - KEY POINT: if A calls FINALIZE, then it is erroneous for B or C to try to communicate with A again. - Hence, OMPI is *correct* to only effect a barrier across each jobs' MPI_COMM_WORLD before exiting. Specifically, if A calls FINALIZE long before B or C, it's *correct* if A exits at any time (and doesn't notify B or C that it is exiting). 
- Arguably, if B or C do try to communicate with the now-gone A, OMPI should try to print a nice error ("you tried to communicate with a job that is already gone...") instead of segv or other Badness. However, that is an *extremely* difficult problem -- sure, it's easy for A to tell B that it is finalizing, but how can A tell C? A doesn't even know about C. You'd need to construct a "connected" graph in a distributed fashion, which is fraught with race conditions, etc. Hence, our conclusion is: OMPI is *correct* in its current behavior (of only doing a barrier across its own COMM_WORLD) before exiting. Any problems that occur are as a result of erroneous MPI applications. We *could* tighten up the erroneous cases and ensure that we print nice error messages / don't crash, but that is such a difficult problem that we decided we have many other, much higher priority issues to handle that deal with non-erroneous cases. */ /* wait for everyone to reach this point This is a grpcomm barrier instead of an MPI barrier because an MPI barrier doesn't ensure that all messages have been transmitted before exiting, so the possibility of a stranded message exists. */ coll = OBJ_NEW(ompi_rte_collective_t); coll->id = ompi_process_info.peer_fini_barrier; coll->active = true; if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) { OMPI_ERROR_LOG(ret); return ret; } /* wait for barrier to complete */ OMPI_LAZY_WAIT_FOR_COMPLETION(coll->active); OBJ_RELEASE(coll); /* check for timing request - get stop time and report elapsed time if so */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_finalize[%ld]: time to execute barrier %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); } /* * Shutdown the Checkpoint/Restart Mech. 
*/ if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) { OMPI_ERROR_LOG(ret); } /* Shut down any bindings-specific issues: C++, F77, F90 */ /* Remove all memory associated by MPI_REGISTER_DATAREP (per MPI-2:9.5.3, there is no way for an MPI application to *un*register datareps, but we don't want the OMPI layer causing memory leaks). */ while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) { OBJ_RELEASE(item); } OBJ_DESTRUCT(&ompi_registered_datareps); /* Remove all F90 types from the hash tables. As the OBJ_DESTRUCT will * call a special destructor able to release predefined types, we can * simply call the OBJ_DESTRUCT on the hash table and all memory will * be correctly released. */ OBJ_DESTRUCT( &ompi_mpi_f90_integer_hashtable ); OBJ_DESTRUCT( &ompi_mpi_f90_real_hashtable ); OBJ_DESTRUCT( &ompi_mpi_f90_complex_hashtable ); /* Free communication objects */ /* free file resources */ if (OMPI_SUCCESS != (ret = ompi_file_finalize())) { return ret; } /* free window resources */ if (OMPI_SUCCESS != (ret = ompi_win_finalize())) { return ret; } if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) { return ret; } /* free pml resource */ if(OMPI_SUCCESS != (ret = mca_pml_base_finalize())) { return ret; } /* free communicator resources */ if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) { return ret; } /* free requests */ if (OMPI_SUCCESS != (ret = ompi_request_finalize())) { return ret; } if (OMPI_SUCCESS != (ret = ompi_message_finalize())) { return ret; } /* If requested, print out a list of memory allocated by ALLOC_MEM but not freed by FREE_MEM */ if (0 != ompi_debug_show_mpi_alloc_mem_leaks) { mca_mpool_base_tree_print(); } /* Now that all MPI objects dealing with communications are gone, shut down MCA types having to do with communications */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } /* shut down buffered send code */ mca_pml_base_bsend_fini(); #if OPAL_ENABLE_FT_CR == 1 /* 
* Shutdown the CRCP Framework, must happen after PML shutdown */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } #endif /* Free secondary resources */ /* free attr resources */ if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) { return ret; } /* free group resources */ if (OMPI_SUCCESS != (ret = ompi_group_finalize())) { return ret; } /* free proc resources */ if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) { return ret; } /* finalize the pubsub functions */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pubsub_base_framework) ) ) { return ret; } /* finalize the DPM framework */ if ( OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_dpm_base_framework))) { return ret; } /* free internal error resources */ if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) { return ret; } /* free error code resources */ if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) { return ret; } /* free errhandler resources */ if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) { return ret; } /* Free all other resources */ /* free op resources */ if (OMPI_SUCCESS != (ret = ompi_op_finalize())) { return ret; } /* free ddt resources */ if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) { return ret; } /* free info resources */ if (OMPI_SUCCESS != (ret = ompi_info_finalize())) { return ret; } /* Close down MCA modules */ /* io is opened lazily, so it's only necessary to close it if it was actually opened */ if (0 < ompi_io_base_framework.framework_refcnt) { /* May have been "opened" multiple times. 
We want it closed now */ ompi_io_base_framework.framework_refcnt = 1; if (OMPI_SUCCESS != mca_base_framework_close(&ompi_io_base_framework)) { return ret; } } (void) mca_base_framework_close(&ompi_topo_base_framework); if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_mpool_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rcache_base_framework))) { return ret; } if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_allocator_base_framework))) { return ret; } if (NULL != ompi_mpi_main_thread) { OBJ_RELEASE(ompi_mpi_main_thread); ompi_mpi_main_thread = NULL; } /* Leave the RTE */ if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) { return ret; } /* now close the rte framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) { OMPI_ERROR_LOG(ret); return ret; } if (OPAL_SUCCESS != (ret = opal_finalize_util())) { return ret; } /* All done */ return MPI_SUCCESS; }
/*
 * Main entry for the ORTE daemon (orted).
 *
 * Parses the command line, bootstraps OPAL/ORTE, optionally daemonizes,
 * wires itself into the RML/routing fabric (reporting back to the HNP
 * unless it *is* the HNP), supports singleton-launch bootstrapping via
 * a URI pipe, and then runs the event loop until an exit event fires.
 *
 * Does not return on the normal path: exits the process with
 * orte_exit_status.  Returns an error code only for early failures
 * (bad command line, init failures).
 */
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
    char *tmp_env_var = NULL;

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can
       detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv("OMPI_MCA_ess", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID) && !defined(__WINDOWS__)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif  /* !defined(__WINDOWS__) */
    /* see if they want us to spin until they can connect a debugger
     * to us -- deliberate busy-wait; a debugger breaks the loop by
     * clearing orted_spin_flag */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(tmp_env_var, "1", true, &environ);
    free(tmp_env_var);
#endif
    tmp_env_var = NULL; /* Silence compiler warning */

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from
     * orte_init->opal_init we continue to have a reference count on
     * them. So we have to finalize them twice...
     */
    opal_finalize_util();

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag && !orte_debug_daemons_flag && orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                                  ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(ret);
        goto DONE;
    }

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when it
       is doing nothing, so it doesn't use any more CPU cycles than it
       should; but when it *is* doing something, we do not want it to
       be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.  For example: when a
       message arrives at the orted, we want the OS to wake up the
       orted in a timely fashion and then process the message as fast
       as possible.  If the orted yields and lets aggressive MPI
       applications get the processor back, routed OOB messages may be
       significantly delayed before delivery to MPI processes, which
       can be problematic (e.g., COMM_SPAWN, BTL wireup). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There should
       be nothing in the orted that cannot block in libevent until
       "something" happens.  This is a minor optimization, but what the
       heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, obtain and report a new process name and my uri to
       the indicated pipe (singleton-launch bootstrap) */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);

#if 0
        /* run our local allocator to read the available
         * allocation in case this singleton decides to
         * comm_spawn other procs
         */
        if (ORTE_SUCCESS != (ret = orte_ras.allocate(jdata))) {
            ORTE_ERROR_LOG(ret);
            /* don't quit as this would cause the singleton
             * to hang!
             */
        }
#endif

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->alive = true;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->alive = true;
        proc->app_idx = 0;
        proc->local_proc = true;
#if OPAL_HAVE_HWLOC
        proc->bind_idx = 0;
#endif

        /* the singleton will use the first three collectives
         * for its modex/barriers
         */
        orte_grpcomm_base.coll_id += 3;

        /* need to setup a pidmap for it */
        jdata->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
        if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(jdata->pmap))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* if we don't yet have a daemon map, then we have to generate one
         * to pass back to it
         */
        if (NULL == orte_odls_globals.dmap) {
            orte_odls_globals.dmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
            /* construct a nodemap */
            if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(orte_odls_globals.dmap))) {
                ORTE_ERROR_LOG(ret);
                goto DONE;
            }
        }

        /* create a string that contains our uri + the singleton's name + sysinfo */
        orte_util_convert_process_name_to_string(&nptr, &proc->name);
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s][%s]", orte_process_info.my_daemon_uri, nptr, sysinfo);
        free(nptr);
        free(sysinfo);

        /* pass that info to the singleton */
        /* NOTE(review): the write/send return value is ignored -- a
         * short write to the pipe would go undetected; confirm this is
         * acceptable for the bootstrap handshake */
#ifndef __WINDOWS__
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */
#else
        send(orted_globals.uri_pipe, tmp, strlen(tmp)+1, 0); /* need to add 1 to get the NULL */
#endif

        /* cleanup */
        free(tmp);
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    mca_base_param_reg_string_name("orte", "parent_uri",
                                   "URI for the parent if tree launch is enabled.",
                                   true, false, NULL, &rml_uri);
    if (NULL != rml_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        if (ORTE_SUCCESS != (ret = orte_rml.set_contact_info(rml_uri))) {
            ORTE_ERROR_LOG(ret);
            free(rml_uri);
            goto DONE;
        }
        ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
        if( ORTE_SUCCESS != ret ) {
            ORTE_ERROR_LOG(ret);
            free(rml_uri);
            goto DONE;
        }
        free(rml_uri);
        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */
        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

#if OPAL_HAVE_HWLOC
        /* add the local topology */
        if (NULL != opal_hwloc_topology &&
            (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        if ((orte_static_ports || orte_use_common_port) && !orted_globals.tree_spawn) {
            /* use the rollup collective to send our data to the HNP
             * so we minimize the HNP bottleneck
             */
            orte_grpcomm_collective_t *coll;
            coll = OBJ_NEW(orte_grpcomm_collective_t);
            /* get the list of contributors we need from the routed module */
            orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll);
            /* add the collective to our list */
            opal_list_append(&orte_grpcomm_base.active_colls, &coll->super);
            /* send the buffer to ourselves to start the collective */
            if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, buffer,
                                                   ORTE_RML_TAG_ROLLUP, 0,
                                                   rml_cbfunc, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            /* send directly to the HNP's callback */
            if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                                   ORTE_RML_TAG_ORTED_CALLBACK, 0,
                                                   rml_cbfunc, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        }
    }

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(orte_exit_status);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
/*
 * ompi_info tool entry point: parse the command line, open the MCA
 * parameter system, register the list of known framework names, and
 * print whatever subset of version/path/config/parameter information
 * the user asked for.  Returns 0 on success; exits directly (with the
 * cmd-line error status) on setup failures.
 */
int main(int argc, char *argv[])
{
    int ret = 0;
    bool want_help = false;
    bool cmd_error = false;
    bool acted = false;          /* did any command-line option trigger output? */
    bool want_all = false;
    char **app_env = NULL, **global_env = NULL;
    int i, len;
    char *str;

    /* Initialize the argv parsing handle */
    if (OMPI_SUCCESS != opal_init_util(&argc, &argv)) {
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true,
                       "opal_init_util", __FILE__, __LINE__, NULL);
        /* NOTE(review): ret is still 0 here, so this exits with a
         * "success" status even though initialization failed -- confirm
         * whether a non-zero status was intended */
        exit(ret);
    }

    ompi_info_cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (NULL == ompi_info_cmd_line) {
        ret = errno;
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true,
                       "opal_cmd_line_create", __FILE__, __LINE__, NULL);
        opal_finalize_util();
        exit(ret);
    }

    /* Register all the command-line options this tool understands */
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'v', NULL, "version", 2,
                            "Show version of Open MPI or a component. The first parameter can be the keywords \"ompi\" or \"all\", a framework name (indicating all components in a framework), or a framework:component string (indicating a specific component). The second parameter can be one of: full, major, minor, release, greek, svn.");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "param", 2,
                            "Show MCA parameters. The first parameter is the framework (or the keyword \"all\"); the second parameter is the specific component name (or the keyword \"all\").");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "internal", 0,
                            "Show internal MCA parameters (not meant to be modified by users)");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "path", 1,
                            "Show paths that Open MPI was configured with. Accepts the following parameters: prefix, bindir, libdir, incdir, mandir, pkglibdir, sysconfdir");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "arch", 0,
                            "Show architecture Open MPI was compiled on");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'c', NULL, "config", 0,
                            "Show configuration options");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'h', NULL, "help", 0,
                            "Show this help message");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "ompi_info_pretty", 0,
                            "When used in conjunction with other parameters, the output is displayed in 'ompi_info_prettyprint' format (default)");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "parsable", 0,
                            "When used in conjunction with other parameters, the output is displayed in a machine-parsable format");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "parseable", 0,
                            "Synonym for --parsable");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, '\0', NULL, "hostname", 0,
                            "Show the hostname that Open MPI was configured "
                            "and built on");
    opal_cmd_line_make_opt3(ompi_info_cmd_line, 'a', NULL, "all", 0,
                            "Show all configuration options and MCA parameters");

    /* Call some useless functions in order to guarantee to link in some
     * global variables.  Only check the return value so that the
     * compiler doesn't optimize out the useless function. */
    if (OMPI_SUCCESS != ompi_comm_link_function()) {
        /* Stop .. or I'll say stop again! */
        ++ret;
    } else {
        --ret;
    }

    /* set our threading level */
    opal_set_using_threads(false);

    /* Get MCA parameters, if any */
    if( OMPI_SUCCESS != mca_base_open() ) {
        orte_show_help("help-ompi_info.txt", "lib-call-fail", true,
                       "mca_base_open", __FILE__, __LINE__ );
        OBJ_RELEASE(ompi_info_cmd_line);
        opal_finalize_util();
        exit(1);
    }
    mca_base_cmd_line_setup(ompi_info_cmd_line);

    /* Do the parsing */
    if (OMPI_SUCCESS != opal_cmd_line_parse(ompi_info_cmd_line, false, argc, argv)) {
        cmd_error = true;
    }
    if (!cmd_error &&
        (opal_cmd_line_is_taken(ompi_info_cmd_line, "help") ||
         opal_cmd_line_is_taken(ompi_info_cmd_line, "h"))) {
        want_help = true;
    }
    if (cmd_error || want_help) {
        /* print usage and exit: 1 for a parse error, 0 for --help */
        char *usage = opal_cmd_line_get_usage_msg(ompi_info_cmd_line);
        orte_show_help("help-ompi_info.txt", "usage", true, usage);
        free(usage);
        mca_base_close();
        OBJ_RELEASE(ompi_info_cmd_line);
        opal_finalize_util();
        exit(cmd_error ? 1 : 0);
    }

    mca_base_cmd_line_process_args(ompi_info_cmd_line, &app_env, &global_env);

    /* putenv() all the stuff that we got back from env (in case the
     * user specified some --mca params on the command line).  This
     * creates a memory leak, but that's unfortunately how putenv()
     * works.  :-( */
    len = opal_argv_count(app_env);
    for (i = 0; i < len; ++i) {
        putenv(app_env[i]);
    }
    len = opal_argv_count(global_env);
    for (i = 0; i < len; ++i) {
        putenv(global_env[i]);
    }

    /* setup the mca_types array: the full list of framework names that
     * version/parameter queries are allowed to reference */
    OBJ_CONSTRUCT(&mca_types, opal_pointer_array_t);
    opal_pointer_array_init(&mca_types, 256, INT_MAX, 128);

    opal_pointer_array_add(&mca_types, "mca");
    opal_pointer_array_add(&mca_types, "mpi");
    opal_pointer_array_add(&mca_types, "orte");
    opal_pointer_array_add(&mca_types, "opal");
    opal_pointer_array_add(&mca_types, "filter");
    opal_pointer_array_add(&mca_types, "backtrace");
    opal_pointer_array_add(&mca_types, "memchecker");
    opal_pointer_array_add(&mca_types, "memory");
    opal_pointer_array_add(&mca_types, "paffinity");
    opal_pointer_array_add(&mca_types, "carto");
    opal_pointer_array_add(&mca_types, "shmem");
    opal_pointer_array_add(&mca_types, "maffinity");
    opal_pointer_array_add(&mca_types, "timer");
    opal_pointer_array_add(&mca_types, "installdirs");
    opal_pointer_array_add(&mca_types, "sysinfo");
    opal_pointer_array_add(&mca_types, "hwloc");
#if OPAL_ENABLE_FT_CR == 1
    /* checkpoint/restart frameworks only exist in FT builds */
    opal_pointer_array_add(&mca_types, "crs");
#endif
    opal_pointer_array_add(&mca_types, "dpm");
    opal_pointer_array_add(&mca_types, "pubsub");
    opal_pointer_array_add(&mca_types, "allocator");
    opal_pointer_array_add(&mca_types, "coll");
    opal_pointer_array_add(&mca_types, "io");
    opal_pointer_array_add(&mca_types, "mpool");
    opal_pointer_array_add(&mca_types, "pml");
    opal_pointer_array_add(&mca_types, "bml");
    opal_pointer_array_add(&mca_types, "rcache");
    opal_pointer_array_add(&mca_types, "btl");
    opal_pointer_array_add(&mca_types, "mtl");
    opal_pointer_array_add(&mca_types, "topo");
    opal_pointer_array_add(&mca_types, "osc");
    opal_pointer_array_add(&mca_types, "op");
    opal_pointer_array_add(&mca_types, "common");
#if OPAL_ENABLE_FT_CR == 1
    opal_pointer_array_add(&mca_types, "crcp");
#endif
#if !ORTE_DISABLE_FULL_SUPPORT
    opal_pointer_array_add(&mca_types, "iof");
    opal_pointer_array_add(&mca_types, "oob");
    opal_pointer_array_add(&mca_types, "odls");
    opal_pointer_array_add(&mca_types, "ras");
    opal_pointer_array_add(&mca_types, "rmaps");
    opal_pointer_array_add(&mca_types, "rml");
    opal_pointer_array_add(&mca_types, "routed");
    opal_pointer_array_add(&mca_types, "plm");
#if OPAL_ENABLE_FT_CR == 1
    opal_pointer_array_add(&mca_types, "snapc");
#endif
    opal_pointer_array_add(&mca_types, "filem");
#endif
    /* these are always included */
    opal_pointer_array_add(&mca_types, "errmgr");
    opal_pointer_array_add(&mca_types, "ess");
    opal_pointer_array_add(&mca_types, "grpcomm");
    opal_pointer_array_add(&mca_types, "notifier");

    /* Execute the desired action(s) */
    if (opal_cmd_line_is_taken(ompi_info_cmd_line, "ompi_info_pretty")) {
        ompi_info_pretty = true;
    } else if (opal_cmd_line_is_taken(ompi_info_cmd_line, "parsable") ||
               opal_cmd_line_is_taken(ompi_info_cmd_line, "parseable")) {
        ompi_info_pretty = false;
    }

    want_all = opal_cmd_line_is_taken(ompi_info_cmd_line, "all");
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "version")) {
        ompi_info_do_version(want_all, ompi_info_cmd_line);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "path")) {
        ompi_info_do_path(want_all, ompi_info_cmd_line);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "arch")) {
        ompi_info_do_arch();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "hostname")) {
        ompi_info_do_hostname();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "config")) {
        ompi_info_do_config(true);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "param")) {
        ompi_info_do_params(want_all,
                            opal_cmd_line_is_taken(ompi_info_cmd_line, "internal"));
        acted = true;
    }

    /* If no command line args are specified, show default set */
    if (!acted) {
        ompi_info_show_ompi_version(ompi_info_ver_full);
        ompi_info_show_path(ompi_info_path_prefix, opal_install_dirs.prefix);
        ompi_info_do_arch();
        ompi_info_do_hostname();
        ompi_info_do_config(false);
        ompi_info_open_components();
        for (i = 0; i < mca_types.size; ++i) {
            if (NULL == (str = (char*)opal_pointer_array_get_item(&mca_types, i))) {
                continue;
            }
            /* "mpi" framework versions are deliberately skipped here */
            if (0 != strcmp("mpi", str)) {
                ompi_info_show_component_version(str, ompi_info_component_all,
                                                 ompi_info_ver_full, ompi_info_type_all);
            }
        }
    }

    /* All done -- tear everything down in reverse order of setup */
    if (NULL != app_env) {
        opal_argv_free(app_env);
    }
    if (NULL != global_env) {
        opal_argv_free(global_env);
    }
    ompi_info_close_components();
    OBJ_RELEASE(ompi_info_cmd_line);
    OBJ_DESTRUCT(&mca_types);
    mca_base_close();
    opal_finalize_util();

    return 0;
}
/*
 * orte_daemon: main body of the ORTE daemon (orted).  Parses the
 * daemon's command line, initializes OPAL/ORTE, optionally binds the
 * daemon to a set of cores, sets up singleton support and the parent
 * route for tree launches, reports back to the HNP, then runs the
 * event loop until told to exit.  Returns/exits with the accumulated
 * exit status.  Ordering of the steps below is significant.
 */
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false, argv[0], args);
        free(args);
        return 1;
    }

#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif

    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var, "1", true, &environ);
    free(tmp_env_var);
#endif

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

#if OPAL_HAVE_HWLOC
    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, pucpus, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            pucpus = hwloc_bitmap_alloc();
            res = hwloc_bitmap_alloc();
            /* accumulate the union of the online+allowed cpusets of
             * every requested (logical) processing unit */
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    goto DONE;
                }
                hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                hwloc_bitmap_or(res, ours, pucpus);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(pucpus);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }
#endif

    /* If a vpid was ordered to fail (testing support), check whether it
     * is us and fail in the requested manner */
    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");
                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* create a string that contains our uri + sysinfo + PMIx server URI */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, pmix_server_uri);
        free(sysinfo);

        /* pass that info to the singleton */
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */

        /* cleanup */
        free(tmp);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            /* NOTE(review): the hard-coded 9 presumably equals
             * strlen(OPAL_MCA_PREFIX) -- confirm they stay in sync */
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        orte_rml.set_contact_info(orte_parent_uri);
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;
        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */
        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

        /* if requested, include any non-loopback aliases for this node */
        if (orte_retain_aliases) {
            char **aliases=NULL;
            uint8_t naliases, ni;
            /* note: shadows the function-scope hostname[100] above */
            char hostname[ORTE_MAX_HOSTNAME_SIZE];

            /* if we stripped the prefix or removed the fqdn,
             * include full hostname as an alias
             */
            gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
            if (strlen(orte_process_info.nodename) < strlen(hostname)) {
                opal_argv_append_nosize(&aliases, hostname);
            }
            opal_ifgetaliases(&aliases);
            naliases = opal_argv_count(aliases);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
            for (ni=0; ni < naliases; ni++) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            }
            opal_argv_free(aliases);
        }

#if OPAL_HAVE_HWLOC
        {
            char *coprocessors;
            /* add the local topology */
            if (NULL != opal_hwloc_topology &&
                (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            /* detect and add any coprocessors */
            coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* see if I am on a coprocessor */
            coprocessors = opal_hwloc_base_check_on_coprocessor();
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        /* send to the HNP's callback - will be routed if routes are available */
        if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                               ORTE_RML_TAG_ORTED_CALLBACK,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
    }

    /* if we are tree-spawning, then we need to capture the MCA params
     * from our cmd line so we can pass them along to the daemons we spawn -
     * otherwise, only the first layer of daemons will ever see them
     */
    if (orted_globals.tree_spawn) {
        int j, k;
        bool ignore;
        char *no_keep[] = {
            "orte_hnp_uri", "orte_ess_jobid", "orte_ess_vpid", "orte_ess_num_procs",
            "orte_parent_uri", "mca_base_env_list", NULL
        };
        for (i=0; i < argc; i++) {
            if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID, argv[i]) ||
                0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
                ignore = false;
                /* see if this is something we cannot pass along */
                for (k=0; NULL != no_keep[k]; k++) {
                    if (0 == strcmp(no_keep[k], argv[i+1])) {
                        ignore = true;
                        break;
                    }
                }
                if (!ignore) {
                    /* see if this is already present so we at least can
                     * avoid growing the cmd line with duplicates
                     */
                    if (NULL != orted_cmd_line) {
                        for (j=0; NULL != orted_cmd_line[j]; j++) {
                            if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
                                /* already here - ignore it */
                                ignore = true;
                                break;
                            }
                        }
                    }
                    if (!ignore) {
                        /* append the flag, the param name, and its value */
                        opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
                    }
                }
                i += 2;
            }
        }
    }

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }
    ret = ORTE_SUCCESS;

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
int main( int argc, char* argv[] ) { ddt_segment_t* segments; int *send_buffer, *recv_buffer; int i, seg_count, errors; int show_only_first_error = 1; ompi_datatype_t* datatype = MPI_DATATYPE_NULL; #define NELT (300) send_buffer = malloc(NELT*sizeof(int)); recv_buffer = malloc(NELT*sizeof(int)); for (i = 0; i < NELT; ++i) { send_buffer[i] = i; recv_buffer[i] = 0xdeadbeef; } opal_init_util (NULL, NULL); ompi_datatype_init(); ompi_datatype_create_vector(NELT/2, 1, 2, MPI_INT, &datatype); ompi_datatype_commit(&datatype); #if (OPAL_ENABLE_DEBUG == 1) && (OPAL_C_HAVE_VISIBILITY == 0) opal_unpack_debug = false; opal_pack_debug = false; opal_position_debug = false; #endif /* OPAL_ENABLE_DEBUG */ create_segments( datatype, 1, fragment_size, &segments, &seg_count ); /* shuffle the segments */ shuffle_segments( segments, seg_count ); /* pack the data */ pack_segments( datatype, 1, fragment_size, segments, seg_count, send_buffer ); /* unpack the data back in the user space (recv buffer) */ unpack_segments( datatype, 1, fragment_size, segments, seg_count, recv_buffer ); /* And now check the data */ for( errors = i = 0; i < NELT; i++ ) { int expected = ((i % 2) ? (int)0xdeadbeef : i); if (recv_buffer[i] != expected) { if( (show_only_first_error && (0 == errors)) || !show_only_first_error ) { printf("error at index %4d: 0x%08x != 0x%08x\n", i, recv_buffer[i], expected); } errors++; } } printf( "Found %d errors\n", errors ); free(send_buffer); free(recv_buffer); for( i = 0; i < seg_count; i++ ) { free( segments[i].buffer ); } free(segments); ompi_datatype_finalize(); opal_finalize_util (); return (0 == errors ? 0 : -1); }
/*
 * ompi_info tool entry point (framework-registry style): initializes
 * OPAL, builds the framework-type and component maps via the info
 * helper APIs, then emits the version/path/config/parameter sections
 * selected on the command line (or the default set when none are
 * given).  Returns 0 on success; exits directly on setup failures.
 */
int main(int argc, char *argv[])
{
    int ret = 0;
    bool acted = false;          /* did any option trigger output? */
    bool want_all = false;
    char **app_env = NULL, **global_env = NULL;
    int i;
    opal_cmd_line_t *ompi_info_cmd_line;
    opal_pointer_array_t mca_types;
    opal_pointer_array_t component_map;
    opal_info_component_map_t *map;

    /* protect against problems if someone passes us thru a pipe
     * and then abnormally terminates the pipe early */
    signal(SIGPIPE, SIG_IGN);

    /* Initialize the argv parsing handle */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        opal_show_help("help-opal_info.txt", "lib-call-fail", true,
                       "opal_init_util", __FILE__, __LINE__, NULL);
        /* NOTE(review): ret is still 0 here, so this exits with a
         * "success" status on an init failure -- confirm intent */
        exit(ret);
    }

    ompi_info_cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (NULL == ompi_info_cmd_line) {
        ret = errno;
        opal_show_help("help-opal_info.txt", "lib-call-fail", true,
                       "opal_cmd_line_create", __FILE__, __LINE__, NULL);
        exit(ret);
    }

    /* initialize the command line, parse it, and return the directives
     * telling us what the user wants output
     */
    if (OPAL_SUCCESS != (ret = opal_info_init(argc, argv, ompi_info_cmd_line))) {
        exit(ret);
    }

    /* --version short-circuits everything else */
    if (opal_cmd_line_is_taken(ompi_info_cmd_line, "version")) {
        fprintf(stdout, "Open MPI v%s\n\n%s\n", OPAL_VERSION, PACKAGE_BUGREPORT);
        exit(0);
    }

    /* setup the mca_types array */
    OBJ_CONSTRUCT(&mca_types, opal_pointer_array_t);
    opal_pointer_array_init(&mca_types, 256, INT_MAX, 128);

    /* add in the opal frameworks */
    opal_info_register_types(&mca_types);
#if OMPI_RTE_ORTE
    /* add in the orte frameworks */
    orte_info_register_types(&mca_types);
#endif
    ompi_info_register_types(&mca_types);

    /* init the component map */
    OBJ_CONSTRUCT(&component_map, opal_pointer_array_t);
    opal_pointer_array_init(&component_map, 256, INT_MAX, 128);

    /* Register OMPI's params */
    if (OMPI_SUCCESS != (ret = ompi_info_register_framework_params(&component_map))) {
        if (OMPI_ERR_BAD_PARAM == ret) {
            /* output what we got */
            opal_info_do_params(true, opal_cmd_line_is_taken(ompi_info_cmd_line, "internal"),
                                &mca_types, &component_map, NULL);
        }
        exit(1);
    }

    /* Execute the desired action(s) */
    want_all = opal_cmd_line_is_taken(ompi_info_cmd_line, "all");
    if (want_all) {
        opal_info_out("Package", "package", OPAL_PACKAGE_STRING);
        ompi_info_show_ompi_version(opal_info_ver_full);
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "path")) {
        opal_info_do_path(want_all, ompi_info_cmd_line);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "arch")) {
        opal_info_do_arch();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "hostname")) {
        opal_info_do_hostname();
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "config")) {
        ompi_info_do_config(true);
        acted = true;
    }
    if (want_all || opal_cmd_line_is_taken(ompi_info_cmd_line, "param") ||
        opal_cmd_line_is_taken(ompi_info_cmd_line, "params")) {
        opal_info_do_params(want_all, opal_cmd_line_is_taken(ompi_info_cmd_line, "internal"),
                            &mca_types, &component_map, ompi_info_cmd_line);
        acted = true;
    }
    if (opal_cmd_line_is_taken(ompi_info_cmd_line, "type")) {
        opal_info_do_type(ompi_info_cmd_line);
        acted = true;
    }

    /* If no command line args are specified, show default set */
    if (!acted) {
        opal_info_out("Package", "package", OPAL_PACKAGE_STRING);
        ompi_info_show_ompi_version(opal_info_ver_full);
        opal_info_show_path(opal_info_path_prefix, opal_install_dirs.prefix);
        opal_info_do_arch();
        opal_info_do_hostname();
        ompi_info_do_config(false);
        opal_info_show_component_version(&mca_types, &component_map, opal_info_type_all,
                                         opal_info_component_all, opal_info_ver_full,
                                         opal_info_ver_all);
    }

    /* All done -- release everything built above */
    if (NULL != app_env) {
        opal_argv_free(app_env);
    }
    if (NULL != global_env) {
        opal_argv_free(global_env);
    }
    ompi_info_close_components();
    OBJ_RELEASE(ompi_info_cmd_line);
    OBJ_DESTRUCT(&mca_types);
    for (i=0; i < component_map.size; i++) {
        if (NULL != (map = (opal_info_component_map_t*)opal_pointer_array_get_item(&component_map, i))) {
            OBJ_RELEASE(map);
        }
    }
    OBJ_DESTRUCT(&component_map);

    opal_info_finalize();

    /* Put our own call to opal_finalize_util() here because we called it
     * up above (and it refcounts) */
    opal_finalize_util();

    return 0;
}
int main(int argc, char **argv) { /* local variables */ opal_list_t list, x; size_t indx,i,list_size, tmp_size_1, tmp_size_2,size_elements; int error_cnt, rc; test_data_t *elements, *ele; opal_list_item_t *item; rc = opal_init_util(&argc, &argv); test_verify_int(OPAL_SUCCESS, rc); if (OPAL_SUCCESS != rc) { test_finalize(); exit(1); } test_init("opal_list_t"); /* initialize list */ OBJ_CONSTRUCT(&list, opal_list_t); OBJ_CONSTRUCT(&x, opal_list_t); /* check length of list */ list_size=opal_list_get_size(&list); if( 0 == list_size ) { test_success(); } else { test_failure(" opal_list_get_size"); } /* check for empty */ if (opal_list_is_empty(&list)) { test_success(); } else { test_failure(" opal_list_is_empty(empty list)"); } /* create test elements */ size_elements=4; elements=(test_data_t *)malloc(sizeof(test_data_t)*size_elements); assert(elements); for(i=0 ; i < size_elements ; i++) { OBJ_CONSTRUCT(elements + i, test_data_t); (elements+i)->data=i; } /* populate list */ for(i=0 ; i < size_elements ; i++) { opal_list_append(&list,(opal_list_item_t *)(elements+i)); } list_size=opal_list_get_size(&list); if( list_size == size_elements ) { test_success(); } else { test_failure(" populating list"); } /* checking for empty on non-empty list */ if (!opal_list_is_empty(&list)) { test_success(); } else { test_failure(" opal_list_is_empty(non-empty list)"); } /* check that list is ordered as expected */ i=0; error_cnt=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { if( ele->data != i ) error_cnt++; i++; } if( 0 == error_cnt ) { test_success(); } else { test_failure(" error in list order "); } /* check opal_list_get_first */ ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_get_first(&list); assert(ele); if( 0 == ele->data ) { test_success(); } else { test_failure(" error in opal_list_get_first"); } i=0; for(ele = (test_data_t *) 
opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( size_elements == i ) { test_success(); } else { test_failure(" error in opal_list_get_first - list size changed "); } /* check opal_list_get_last */ ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_get_last(&list); assert(ele); if( (size_elements-1) == ele->data ) { test_success(); } else { test_failure(" error in opal_list_get_last"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( size_elements == i ) { test_success(); } else { test_failure(" error in opal_list_get_first - list size changed "); } /* check opal_list_remove_first */ ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_remove_first(&list); assert(ele); if( 0 == ele->data ) { test_success(); } else { test_failure(" error in opal_list_remove_first"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( (size_elements-1) == i ) { test_success(); } else { test_failure(" error in opal_list_remove_first - list size changed "); } /* test opal_list_prepend */ opal_list_prepend(&list,(opal_list_item_t *)elements); ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_get_first(&list); assert(ele); if( 0 == ele->data ) { test_success(); } else { test_failure(" error in opal_list_prepend"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( size_elements == i ) { test_success(); } else { test_failure(" error in opal_list_prepend - list size changed "); } /* check opal_list_remove_last */ ele = (test_data_t *)NULL; ele = (test_data_t 
*) opal_list_remove_last(&list); assert(ele); if( (size_elements-1) == ele->data ) { test_success(); } else { test_failure(" error in opal_list_remove_last"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( (size_elements-1) == i ) { test_success(); } else { test_failure(" error in opal_list_remove_last - list size changed "); } /* test opal_list_append */ opal_list_append(&list,(opal_list_item_t *)(elements+size_elements-1)); ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_get_last(&list); assert(ele); if( (size_elements-1) == ele->data ) { test_success(); } else { test_failure(" error in opal_list_append"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( size_elements == i ) { test_success(); } else { test_failure(" error in opal_list_append - list size changed "); } /* remove element from list */ indx=size_elements/2; if( 0 == indx ) indx=1; assert(2 <= size_elements); ele = (test_data_t *)NULL; ele = (test_data_t *) opal_list_remove_item(&list,(opal_list_item_t *)(elements+indx)); assert(ele); if( (indx-1) == ele->data ) { test_success(); } else { test_failure(" error in opal_list_remove - previous"); } ele=(test_data_t *)(((opal_list_item_t *)ele)->opal_list_next); if( (indx+1) == ele->data ) { test_success(); } else { test_failure(" error in opal_list_remove - next"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( (size_elements-1) == i ) { test_success(); } else { test_failure(" error in opal_list_remove - list size changed incorrectly"); } /* test the insert function */ i=opal_list_insert(&list,(opal_list_item_t 
*)(elements+indx),indx); if( 1 == i ) { test_success(); } else { test_failure(" error in opal_list_remove_item \n"); } i=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { i++; } if( size_elements == i ) { test_success(); } else { test_failure(" error in opal_list_insert - incorrect list length"); } i=0; error_cnt=0; for(ele = (test_data_t *) opal_list_get_first(&list); ele != (test_data_t *) opal_list_get_end(&list); ele = (test_data_t *) ((opal_list_item_t *)ele)->opal_list_next) { if( ele->data != i ) error_cnt++; i++; } if( 0 == error_cnt ) { test_success(); } else { test_failure(" error in list order - opal_list_remove_item "); } /* test the splice and join functions */ list_size = opal_list_get_size(&list); for (i = 0, item = opal_list_get_first(&list) ; i < list_size / 2 ; ++i, item = opal_list_get_next(item)) { } opal_list_splice(&x, opal_list_get_end(&x), &list, item, opal_list_get_end(&list)); tmp_size_1 = opal_list_get_size(&list); tmp_size_2 = opal_list_get_size(&x); if (tmp_size_1 != i) { test_failure(" error in splice (size of list)"); } else if (tmp_size_2 != list_size - tmp_size_1) { test_failure(" error in splice (size of x)"); } else { test_success(); } opal_list_join(&list, opal_list_get_end(&list), &x); tmp_size_1 = opal_list_get_size(&list); tmp_size_2 = opal_list_get_size(&x); if (tmp_size_1 != list_size) { test_failure(" error in join (size of list)"); } else if (tmp_size_2 != 0) { test_failure(" error in join (size of x)"); } else { test_success(); } if (NULL != elements) free(elements); opal_finalize_util (); return test_finalize(); }
int orte_daemon(int argc, char *argv[]) { int ret = 0; opal_cmd_line_t *cmd_line = NULL; int i; opal_buffer_t *buffer; char hostname[OPAL_MAXHOSTNAMELEN]; #if OPAL_ENABLE_FT_CR == 1 char *tmp_env_var = NULL; #endif /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals)); /* initialize the singleton died pipe to an illegal value so we can detect it was set */ orted_globals.singleton_died_pipe = -1; bucket = OBJ_NEW(opal_buffer_t); /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) { OBJ_RELEASE(cmd_line); exit(1); } mca_base_cmd_line_setup(cmd_line); if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false, argc, argv))) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args); free(args); OBJ_RELEASE(cmd_line); return ret; } /* * Since this process can now handle MCA/GMCA parameters, make sure to * process them. */ mca_base_cmd_line_process_args(cmd_line, &environ, &environ); /* Ensure that enough of OPAL is setup for us to be able to run */ /* * NOTE: (JJH) * We need to allow 'mca_base_cmd_line_process_args()' to process command * line arguments *before* calling opal_init_util() since the command * line could contain MCA parameters that affect the way opal_init_util() * functions. AMCA parameters are one such option normally received on the * command line that affect the way opal_init_util() behaves. * It is "safe" to call mca_base_cmd_line_process_args() before * opal_init_util() since mca_base_cmd_line_process_args() does *not* * depend upon opal_init_util() functionality. */ if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) { fprintf(stderr, "OPAL failed to initialize -- orted aborting\n"); exit(1); } /* save the environment for launch purposes. 
This MUST be * done so that we can pass it to any local procs we * spawn - otherwise, those local procs won't see any * non-MCA envars that were set in the enviro when the * orted was executed - e.g., by .csh */ orte_launch_environ = opal_argv_copy(environ); /* purge any ess/pmix flags set in the environ when we were launched */ opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ); opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ); /* if orte_daemon_debug is set, let someone know we are alive right * away just in case we have a problem along the way */ if (orted_globals.debug) { gethostname(hostname, sizeof(hostname)); fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname); } /* check for help request */ if (orted_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); orte_show_help("help-orted.txt", "orted:usage", false, argv[0], args); free(args); return 1; } #if defined(HAVE_SETSID) /* see if we were directed to separate from current session */ if (orted_globals.set_sid) { setsid(); } #endif /* see if they want us to spin until they can connect a debugger to us */ i=0; while (orted_spin_flag) { i++; if (1000 < i) i=0; } #if OPAL_ENABLE_FT_CR == 1 /* Mark as a tool program */ (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var); opal_setenv(tmp_env_var, "1", true, &environ); free(tmp_env_var); #endif /* detach from controlling terminal * otherwise, remain attached so output can get to us */ if(!orte_debug_flag && !orte_debug_daemons_flag && orted_globals.daemonize) { opal_daemon_init(NULL); } /* Set the flag telling OpenRTE that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would * require. 
*/ if (orted_globals.hnp) { if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) { ORTE_ERROR_LOG(ret); return ret; } } else { if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) { ORTE_ERROR_LOG(ret); return ret; } } /* finalize the OPAL utils. As they are opened again from orte_init->opal_init * we continue to have a reference count on them. So we have to finalize them twice... */ opal_finalize_util(); /* bind ourselves if so directed */ if (NULL != orte_daemon_cores) { char **cores=NULL, tmp[128]; hwloc_obj_t pu; hwloc_cpuset_t ours, res; int core; /* could be a collection of comma-delimited ranges, so * use our handy utility to parse it */ orte_util_parse_range_options(orte_daemon_cores, &cores); if (NULL != cores) { ours = hwloc_bitmap_alloc(); hwloc_bitmap_zero(ours); res = hwloc_bitmap_alloc(); for (i=0; NULL != cores[i]; i++) { core = strtoul(cores[i], NULL, 10); if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) { /* turn off the show help forwarding as we won't * be able to cycle the event library to send */ orte_show_help_finalize(); /* the message will now come out locally */ orte_show_help("help-orted.txt", "orted:cannot-bind", true, orte_process_info.nodename, orte_daemon_cores); ret = ORTE_ERR_NOT_SUPPORTED; hwloc_bitmap_free(ours); hwloc_bitmap_free(res); goto DONE; } hwloc_bitmap_or(res, ours, pu->cpuset); hwloc_bitmap_copy(ours, res); } /* if the result is all zeros, then don't bind */ if (!hwloc_bitmap_iszero(ours)) { (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0); if (opal_hwloc_report_bindings) { opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours); opal_output(0, "Daemon %s is bound to cores %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); } } /* cleanup */ hwloc_bitmap_free(ours); hwloc_bitmap_free(res); opal_argv_free(cores); } } if ((int)ORTE_VPID_INVALID != orted_debug_failure) { orted_globals.abort=false; /* some vpid was ordered to fail. 
The value can be positive * or negative, depending upon the desired method for failure, * so need to check both here */ if (0 > orted_debug_failure) { orted_debug_failure = -1*orted_debug_failure; orted_globals.abort = true; } /* are we the specified vpid? */ if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) { /* if the user specified we delay, then setup a timer * and have it kill us */ if (0 < orted_debug_failure_delay) { ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI); } else { opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orted_globals.abort ? "abort" : "abnormal termination"); /* do -not- call finalize as this will send a message to the HNP * indicating clean termination! Instead, just forcibly cleanup * the local session_dir tree and exit */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); /* if we were ordered to abort, do so */ if (orted_globals.abort) { abort(); } /* otherwise, return with non-zero status */ ret = ORTE_ERROR_DEFAULT_EXIT_CODE; goto DONE; } } } /* insert our contact info into our process_info struct so we * have it for later use and set the local daemon field to our name */ orte_oob_base_get_addr(&orte_process_info.my_daemon_uri); if (NULL == orte_process_info.my_daemon_uri) { /* no way to communicate */ ret = ORTE_ERROR; goto DONE; } ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid; /* if I am also the hnp, then update that contact info field too */ if (ORTE_PROC_IS_HNP) { orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri); ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid; ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid; } /* setup the primary daemon command receive function */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_PERSISTENT, orte_daemon_recv, NULL); /* output a message indicating we are alive, our name, and our pid * for debugging purposes */ if 
(orte_debug_daemons_flag) { fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid, orte_process_info.nodename); } /* We actually do *not* want the orted to voluntarily yield() the processor more than necessary. The orted already blocks when it is doing nothing, so it doesn't use any more CPU cycles than it should; but when it *is* doing something, we do not want it to be unnecessarily delayed because it voluntarily yielded the processor in the middle of its work. For example: when a message arrives at the orted, we want the OS to wake up the orted in a timely fashion (which most OS's seem good about doing) and then we want the orted to process the message as fast as possible. If the orted yields and lets aggressive MPI applications get the processor back, it may be a long time before the OS schedules the orted to run again (particularly if there is no IO event to wake it up). Hence, routed OOB messages (for example) may be significantly delayed before being delivered to MPI processes, which can be problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); /* Change the default behavior of libevent such that we want to continually block rather than blocking for the default timeout and then looping around the progress engine again. There should be nothing in the orted that cannot block in libevent until "something" happens (i.e., there's no need to keep cycling through progress because the only things that should happen will happen in libevent). This is a minor optimization, but what the heck... 
:-) */ opal_progress_set_event_flag(OPAL_EVLOOP_ONCE); /* if requested, report my uri to the indicated pipe */ if (orted_globals.uri_pipe > 0) { orte_job_t *jdata; orte_proc_t *proc; orte_node_t *node; orte_app_context_t *app; char *tmp, *nptr, *sysinfo; char **singenv=NULL, *string_key, *env_str; /* setup the singleton's job */ jdata = OBJ_NEW(orte_job_t); /* default to ompi for now */ opal_argv_append_nosize(&jdata->personality, "ompi"); orte_plm_base_create_jobid(jdata); opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata); /* must create a map for it (even though it has no * info in it) so that the job info will be picked * up in subsequent pidmaps or other daemons won't * know how to route */ jdata->map = OBJ_NEW(orte_job_map_t); /* setup an app_context for the singleton */ app = OBJ_NEW(orte_app_context_t); app->app = strdup("singleton"); app->num_procs = 1; opal_pointer_array_add(jdata->apps, app); jdata->num_apps = 1; /* setup a proc object for the singleton - since we * -must- be the HNP, and therefore we stored our * node on the global node pool, and since the singleton * -must- be on the same node as us, indicate that */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = jdata->jobid; proc->name.vpid = 0; proc->parent = 0; ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE); proc->state = ORTE_PROC_STATE_RUNNING; proc->app_idx = 0; /* obviously, it is on my node */ node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); proc->node = node; OBJ_RETAIN(node); /* keep accounting straight */ opal_pointer_array_add(jdata->procs, proc); jdata->num_procs = 1; /* add the node to the job map */ OBJ_RETAIN(node); opal_pointer_array_add(jdata->map->nodes, node); jdata->map->num_nodes++; /* and it obviously is on the node */ OBJ_RETAIN(proc); opal_pointer_array_add(node->procs, proc); node->num_procs++; /* and obviously it is one of my local procs */ OBJ_RETAIN(proc); opal_pointer_array_add(orte_local_children, proc); jdata->num_local_procs = 1; /* 
set the trivial */ proc->local_rank = 0; proc->node_rank = 0; proc->app_rank = 0; proc->state = ORTE_PROC_STATE_RUNNING; proc->app_idx = 0; ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL); /* set the ORTE_JOB_TRANSPORT_KEY from the environment */ orte_pre_condition_transports(jdata, NULL); /* register the singleton's nspace with our PMIx server */ if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) { ORTE_ERROR_LOG(ret); goto DONE; } /* use setup fork to create the envars needed by the singleton */ if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) { ORTE_ERROR_LOG(ret); goto DONE; } /* append the transport key to the envars needed by the singleton */ if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto DONE; } asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); opal_argv_append_nosize(&singenv, env_str); free(env_str); nptr = opal_argv_join(singenv, '*'); opal_argv_free(singenv); /* create a string that contains our uri + sysinfo + PMIx server URI envars */ orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model); asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr); free(sysinfo); free(nptr); /* pass that info to the singleton */ if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; /* need to add 1 to get the NULL */ ORTE_ERROR_LOG(ret); goto DONE; } /* cleanup */ free(tmp); close(orted_globals.uri_pipe); /* since a singleton spawned us, we need to harvest * any MCA params from the local environment so * we can pass them along to any subsequent daemons * we may start as the result of a comm_spawn */ for (i=0; NULL != environ[i]; i++) { if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) { /* make a copy to manipulate */ tmp = strdup(environ[i]); /* find the equal sign */ nptr = 
strchr(tmp, '='); *nptr = '\0'; nptr++; /* add the mca param to the orted cmd line */ opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append_nosize(&orted_cmd_line, &tmp[9]); opal_argv_append_nosize(&orted_cmd_line, nptr); free(tmp); } } } /* if we were given a pipe to monitor for singleton termination, set that up */ if (orted_globals.singleton_died_pipe > 0) { /* register shutdown handler */ pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t)); opal_event_set(orte_event_base, pipe_handler, orted_globals.singleton_died_pipe, OPAL_EV_READ, pipe_closed, pipe_handler); opal_event_add(pipe_handler, NULL); } /* If I have a parent, then save his contact info so * any messages we send can flow thru him. */ orte_parent_uri = NULL; (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri", "URI for the parent if tree launch is enabled.", MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_CONSTANT, &orte_parent_uri); if (NULL != orte_parent_uri) { orte_process_name_t parent; opal_value_t val; /* set the contact info into our local database */ ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL); if (ORTE_SUCCESS != ret) { ORTE_ERROR_LOG(ret); free (orte_parent_uri); orte_parent_uri = NULL; goto DONE; } OBJ_CONSTRUCT(&val, opal_value_t); val.key = OPAL_PMIX_PROC_URI; val.type = OPAL_STRING; val.data.string = orte_parent_uri; if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&val); goto DONE; } val.key = NULL; val.data.string = NULL; OBJ_DESTRUCT(&val); /* don't need this value anymore */ free(orte_parent_uri); orte_parent_uri = NULL; /* tell the routed module that we have a path * back to the HNP */ if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) { ORTE_ERROR_LOG(ret); goto DONE; } /* set the lifeline to point to our parent so that we * can handle the situation if that lifeline goes away 
*/ if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) { ORTE_ERROR_LOG(ret); goto DONE; } } /* if we are not the HNP...the only time we will be an HNP * is if we are launched by a singleton to provide support * for it */ if (!ORTE_PROC_IS_HNP) { orte_process_name_t target; target.jobid = ORTE_PROC_MY_NAME->jobid; if (orte_fwd_mpirun_port || orte_static_ports) { /* setup the rollup callback */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK, ORTE_RML_PERSISTENT, rollup, NULL); target.vpid = ORTE_PROC_MY_NAME->vpid; /* since we will be waiting for any children to send us * their rollup info before sending to our parent, save * a little time in the launch phase by "warming up" the * connection to our parent while we wait for our children */ buffer = OBJ_NEW(opal_buffer_t); // zero-byte message if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit, ORTE_PROC_MY_PARENT, buffer, ORTE_RML_TAG_WARMUP_CONNECTION, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } else { target.vpid = 0; } /* send the information to the orted report-back point - this function * will process the data, but also counts the number of * orteds that reported back so the launch procedure can continue. 
* We need to do this at the last possible second as the HNP * can turn right around and begin issuing orders to us */ buffer = OBJ_NEW(opal_buffer_t); /* insert our name for rollup purposes */ if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } /* get any connection info we may have pushed */ { opal_value_t *val = NULL, *kv; opal_list_t *modex; int32_t flag; if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) { /* just pack a marker indicating we don't have any to share */ flag = 0; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } else { /* the data is returned as a list of key-value pairs in the opal_value_t */ if (OPAL_PTR == val->type) { modex = (opal_list_t*)val->data.ptr; flag = (int32_t)opal_list_get_size(modex); if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } OPAL_LIST_FOREACH(kv, modex, opal_value_t) { if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } OPAL_LIST_RELEASE(modex); } else { opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key); /* single value */ flag = 1; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); goto DONE; } } OBJ_RELEASE(val); }
int main(int argc, char *argv[]) { int exit_status = 0, ret, flags = 0, i; int exec_argc = 0, user_argc = 0; char **exec_argv = NULL, **user_argv = NULL; char *exec_command, *base_argv0 = NULL; bool disable_flags = true; bool real_flag = false; if (OPAL_SUCCESS != (ret = opal_init_util())) { return ret; } /**************************************************** * * Setup compiler information * ****************************************************/ base_argv0 = opal_basename(argv[0]); #if defined(EXEEXT) if( 0 != strlen(EXEEXT) ) { char extension[] = EXEEXT; char* temp = strstr( base_argv0, extension ); char* old_match = temp; while( NULL != temp ) { old_match = temp; temp = strstr( temp + 1, extension ); } /* Only if there was a match of .exe, erase the last occurence of .exe */ if ( NULL != old_match ) { *old_match = '\0'; } } #endif /* defined(EXEEXT) */ if (OPAL_SUCCESS != (ret = data_init(base_argv0))) { fprintf(stderr, "Error parsing data file %s: %s\n", base_argv0, opal_strerror(ret)); return ret; } for (i = 1 ; i < argc && user_data_idx < 0 ; ++i) { user_data_idx = find_options_index(argv[i]); } /* if we didn't find a match, look for the NULL (base case) options */ if (user_data_idx < 0) { user_data_idx = default_data_idx; } /* if we still didn't find a match, abort */ if (user_data_idx < 0) { char *flat = opal_argv_join(argv, ' '); opal_show_help("help-opal-wrapper.txt", "no-options-support", true, base_argv0, flat, NULL); free(flat); exit(1); } /* compiler */ load_env_data(options_data[user_data_idx].project_short, options_data[user_data_idx].compiler_env, &options_data[user_data_idx].compiler); /* preprocessor flags */ load_env_data_argv(options_data[user_data_idx].project_short, "CPPFLAGS", &options_data[user_data_idx].preproc_flags); /* compiler flags */ load_env_data_argv(options_data[user_data_idx].project_short, options_data[user_data_idx].compiler_flags_env, &options_data[user_data_idx].comp_flags); /* linker flags */ 
load_env_data_argv(options_data[user_data_idx].project_short, "LDFLAGS", &options_data[user_data_idx].link_flags); /* libs */ load_env_data_argv(options_data[user_data_idx].project_short, "LIBS", &options_data[user_data_idx].libs); /**************************************************** * * Sanity Checks * ****************************************************/ if (NULL != options_data[user_data_idx].req_file) { /* make sure the language is supported */ if (0 == strcmp(options_data[user_data_idx].req_file, "not supported")) { opal_show_help("help-opal-wrapper.txt", "no-language-support", true, options_data[user_data_idx].language, base_argv0, NULL); exit_status = 1; goto cleanup; } if (options_data[user_data_idx].req_file[0] != '\0') { char *filename; struct stat buf; filename = opal_os_path( false, options_data[user_data_idx].path_libdir, options_data[user_data_idx].req_file, NULL ); if (0 != stat(filename, &buf)) { opal_show_help("help-opal-wrapper.txt", "file-not-found", true, base_argv0, options_data[user_data_idx].req_file, options_data[user_data_idx].language, NULL); } } } /**************************************************** * * Parse user flags * ****************************************************/ flags = COMP_WANT_COMMAND|COMP_WANT_PREPROC| COMP_WANT_COMPILE|COMP_WANT_LINK; user_argv = opal_argv_copy(argv + 1); user_argc = opal_argv_count(user_argv); for (i = 0 ; i < user_argc ; ++i) { if (0 == strncmp(user_argv[i], "-showme", strlen("-showme")) || 0 == strncmp(user_argv[i], "--showme", strlen("--showme")) || 0 == strncmp(user_argv[i], "-show", strlen("-show")) || 0 == strncmp(user_argv[i], "--show", strlen("--show"))) { bool done_now = false; /* check for specific things we want to see. First three still invoke all the building routines. Last set want to parse out certain flags, so we don't go through the normal build routine - skip to cleanup. 
*/ if (0 == strncmp(user_argv[i], "-showme:command", strlen("-showme:command")) || 0 == strncmp(user_argv[i], "--showme:command", strlen("--showme:command"))) { flags = COMP_WANT_COMMAND; /* we know what we want, so don't process any more args */ done_now = true; } else if (0 == strncmp(user_argv[i], "-showme:compile", strlen("-showme:compile")) || 0 == strncmp(user_argv[i], "--showme:compile", strlen("--showme:compile"))) { flags = COMP_WANT_PREPROC|COMP_WANT_COMPILE; /* we know what we want, so don't process any more args */ done_now = true; } else if (0 == strncmp(user_argv[i], "-showme:link", strlen("-showme:link")) || 0 == strncmp(user_argv[i], "--showme:link", strlen("--showme:link"))) { flags = COMP_WANT_COMPILE|COMP_WANT_LINK; /* we know what we want, so don't process any more args */ done_now = true; } else if (0 == strncmp(user_argv[i], "-showme:incdirs", strlen("-showme:incdirs")) || 0 == strncmp(user_argv[i], "--showme:incdirs", strlen("--showme:incdirs"))) { print_flags(options_data[user_data_idx].preproc_flags, OPAL_INCLUDE_FLAG); goto cleanup; } else if (0 == strncmp(user_argv[i], "-showme:libdirs", strlen("-showme:libdirs")) || 0 == strncmp(user_argv[i], "--showme:libdirs", strlen("--showme:libdirs"))) { print_flags(options_data[user_data_idx].link_flags, OPAL_LIBDIR_FLAG); goto cleanup; } else if (0 == strncmp(user_argv[i], "-showme:libs", strlen("-showme:libs")) || 0 == strncmp(user_argv[i], "--showme:libs", strlen("--showme:libs"))) { print_flags(options_data[user_data_idx].libs, "-l"); goto cleanup; } else if (0 == strncmp(user_argv[i], "-showme:version", strlen("-showme:version")) || 0 == strncmp(user_argv[i], "--showme:version", strlen("--showme:version"))) { opal_show_help("help-opal-wrapper.txt", "version", false, argv[0], options_data[user_data_idx].project, options_data[user_data_idx].version, options_data[user_data_idx].language, NULL); goto cleanup; } else if (0 == strncmp(user_argv[i], "-showme:", strlen("-showme:")) || 0 == 
strncmp(user_argv[i], "--showme:", strlen("--showme:"))) { opal_show_help("help-opal-wrapper.txt", "usage", true, argv[0], options_data[user_data_idx].project, NULL); goto cleanup; } flags |= (COMP_DRY_RUN|COMP_SHOW_ERROR); /* remove element from user_argv */ opal_argv_delete(&user_argc, &user_argv, i, 1); --i; if (done_now) { disable_flags = false; break; } } else if (0 == strcmp(user_argv[i], "-c")) { flags &= ~COMP_WANT_LINK; real_flag = true; } else if (0 == strcmp(user_argv[i], "-E") || 0 == strcmp(user_argv[i], "-M")) { flags &= ~(COMP_WANT_COMPILE | COMP_WANT_LINK); real_flag = true; } else if (0 == strcmp(user_argv[i], "-S")) { flags &= ~COMP_WANT_LINK; real_flag = true; } else if (0 == strcmp(user_argv[i], "-lpmpi")) { flags |= COMP_WANT_PMPI; /* remove element from user_argv */ opal_argv_delete(&user_argc, &user_argv, i, 1); --i; } else if ('-' != user_argv[i][0]) { disable_flags = false; flags |= COMP_SHOW_ERROR; real_flag = true; } else { /* if the option flag is one that we use to determine which set of compiler data to use, don't count it as a real option */ if (find_options_index(user_argv[i]) < 0) { real_flag = true; } } } /* clear out the want_flags if we got no arguments not starting with a - (dash) and -showme wasn't given OR -showme was given and we had at least one more non-showme argument that started with a - (dash) and no other non-dash arguments. 
Some examples: opal_wrapper : clear our flags opal_wrapper -v : clear our flags opal_wrapper -E a.c : don't clear our flags opal_wrapper a.c : don't clear our flags opal_wrapper -showme : don't clear our flags opal_wrapper -showme -v : clear our flags opal_wrapper -showme -E a.c : don't clear our flags opal_wrapper -showme a.c : don't clear our flags */ if (disable_flags && !((flags & COMP_DRY_RUN) && !real_flag)) { flags &= ~(COMP_WANT_PREPROC|COMP_WANT_COMPILE|COMP_WANT_LINK); } #if !OMPI_ENABLE_MPI_PROFILING /* sanity check */ if (flags & COMP_WANT_PMPI) { opal_show_help("help-opal-wrapper.txt", "no-profiling-support", true, argv[0], NULL); } #endif /**************************************************** * * Assemble the command line * ****************************************************/ /* compiler (may be multiple arguments, so split) */ if (flags & COMP_WANT_COMMAND) { exec_argv = opal_argv_split(options_data[user_data_idx].compiler, ' '); exec_argc = opal_argv_count(exec_argv); } else { exec_argv = (char **) malloc(sizeof(char*)); exec_argv[0] = NULL; exec_argc = 0; } /* Per https://svn.open-mpi.org/trac/ompi/ticket/2201, add all the user arguments before anything else. 
*/ opal_argv_insert(&exec_argv, exec_argc, user_argv); exec_argc = opal_argv_count(exec_argv); /* preproc flags */ if (flags & COMP_WANT_PREPROC) { opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].preproc_flags); exec_argc = opal_argv_count(exec_argv); } /* compiler flags */ if (flags & COMP_WANT_COMPILE) { opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].comp_flags); /* Deal with languages like Fortran 90 that have special places and flags for modules or whatever */ if (options_data[user_data_idx].module_option != NULL) { char *line; asprintf(&line, "%s%s", options_data[user_data_idx].module_option, options_data[user_data_idx].path_libdir); opal_argv_append_nosize(&exec_argv, line); free(line); } exec_argc = opal_argv_count(exec_argv); } /* link flags and libs */ if (flags & COMP_WANT_LINK) { opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].link_flags); exec_argc = opal_argv_count(exec_argv); opal_argv_insert(&exec_argv, exec_argc, options_data[user_data_idx].libs); exec_argc = opal_argv_count(exec_argv); } /**************************************************** * * Execute the command * ****************************************************/ if (flags & COMP_DRY_RUN) { exec_command = opal_argv_join(exec_argv, ' '); printf("%s\n", exec_command); } else { char *tmp; #if 0 exec_command = opal_argv_join(exec_argv, ' '); printf("command: %s\n", exec_command); #endif tmp = opal_path_findv(exec_argv[0], 0, environ, NULL); if (NULL == tmp) { opal_show_help("help-opal-wrapper.txt", "no-compiler-found", true, exec_argv[0], NULL); errno = 0; exit_status = 1; } else { int status; free(exec_argv[0]); exec_argv[0] = tmp; ret = opal_few(exec_argv, &status); exit_status = WIFEXITED(status) ? WEXITSTATUS(status) : (WIFSIGNALED(status) ? WTERMSIG(status) : (WIFSTOPPED(status) ? 
WSTOPSIG(status) : 255)); if( (OPAL_SUCCESS != ret) || ((0 != exit_status) && (flags & COMP_SHOW_ERROR)) ) { char* exec_command = opal_argv_join(exec_argv, ' '); if( OPAL_SUCCESS != ret ) { opal_show_help("help-opal-wrapper.txt", "spawn-failed", true, exec_argv[0], strerror(status), exec_command, NULL); } else { #if 0 opal_show_help("help-opal-wrapper.txt", "compiler-failed", true, exec_argv[0], exit_status, exec_command, NULL); #endif } free(exec_command); } } } /**************************************************** * * Cleanup * ****************************************************/ cleanup: opal_argv_free(exec_argv); opal_argv_free(user_argv); if (NULL != base_argv0) free(base_argv0); if (OPAL_SUCCESS != (ret = data_finalize())) { return ret; } if (OPAL_SUCCESS != (ret = opal_finalize_util())) { return ret; } return exit_status; }
/*
 * Entry point for the Open MPI wrapper compiler (mpicc/mpifort/etc.).
 *
 * Flow: initialize OPAL util, load per-language compiler data keyed off
 * argv[0]'s basename, sanity-check that the language is supported, parse
 * the user's command line for wrapper-specific switches (-showme*, -static,
 * -lpmpi, ...), assemble the underlying compiler command line, and either
 * print it (dry run) or exec it, propagating the child's exit status.
 *
 * Returns: the underlying compiler's exit status on success paths, or an
 * OPAL error code / 1 on wrapper-level failures.
 */
int main(int argc, char *argv[])
{
    int exit_status = 0, ret, flags = 0, i;
    int exec_argc = 0, user_argc = 0;
    char **exec_argv = NULL, **user_argv = NULL;
    char *exec_command, *base_argv0 = NULL;
    /* disable_flags / real_flag drive the "clear the want flags" decision
       after the parse loop -- see the examples table below. */
    bool disable_flags = true;
    bool real_flag = false;

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        return ret;
    }

    /****************************************************
     *
     * Setup compiler information
     *
     ****************************************************/

    /* The basename of argv[0] selects which wrapper data set to load
       (e.g. "mpicc" vs "mpifort"). */
    base_argv0 = opal_basename(argv[0]);
#if defined(EXEEXT)
    if( 0 != strlen(EXEEXT) ) {
        /* Windows-style builds: strip the executable extension (e.g. ".exe")
           from the basename.  We scan for the LAST occurrence so a name
           like "mpicc.exe.exe" is only trimmed once, at the end. */
        char extension[] = EXEEXT;
        char* temp = strstr( base_argv0, extension );
        char* old_match = temp;
        while( NULL != temp ) {
            old_match = temp;
            temp = strstr( temp + 1, extension );
        }
        /* Only if there was a match of .exe, erase the last occurence of .exe */
        if ( NULL != old_match ) {
            *old_match = '\0';
        }
    }
#endif /* defined(EXEEXT) */

    if (OPAL_SUCCESS != (ret = data_init(base_argv0))) {
        fprintf(stderr, "Error parsing data file %s: %s\n", base_argv0,
                opal_strerror(ret));
        return ret;
    }

    /* A command-line option may also select the data set (stops at the
       first match). */
    for (i = 1 ; i < argc && user_data_idx < 0 ; ++i) {
        user_data_idx = find_options_index(argv[i]);
    }
    /* if we didn't find a match, look for the NULL (base case) options */
    if (user_data_idx < 0) {
        user_data_idx = default_data_idx;
    }
    /* if we still didn't find a match, abort */
    if (user_data_idx < 0) {
        char *flat = opal_argv_join(argv, ' ');
        opal_show_help("help-opal-wrapper.txt", "no-options-support", true,
                       base_argv0, flat, NULL);
        free(flat);
        exit(1);
    }

    /* Environment variables (e.g. OMPI_CC, OMPI_CPPFLAGS, ...) override
       the values from the wrapper data file. */

    /* compiler */
    load_env_data(options_data[user_data_idx].project_short,
                  options_data[user_data_idx].compiler_env,
                  &options_data[user_data_idx].compiler);

    /* preprocessor flags */
    load_env_data_argv(options_data[user_data_idx].project_short, "CPPFLAGS",
                       &options_data[user_data_idx].preproc_flags);

    /* compiler flags */
    load_env_data_argv(options_data[user_data_idx].project_short,
                       options_data[user_data_idx].compiler_flags_env,
                       &options_data[user_data_idx].comp_flags);

    /* linker flags */
    load_env_data_argv(options_data[user_data_idx].project_short, "LDFLAGS",
                       &options_data[user_data_idx].link_flags);

    /* libs */
    load_env_data_argv(options_data[user_data_idx].project_short, "LIBS",
                       &options_data[user_data_idx].libs);

    /****************************************************
     *
     * Sanity Checks
     *
     ****************************************************/

    if (NULL != options_data[user_data_idx].req_file) {
        /* make sure the language is supported */
        if (0 == strcmp(options_data[user_data_idx].req_file, "not supported")) {
            opal_show_help("help-opal-wrapper.txt", "no-language-support", true,
                           options_data[user_data_idx].language, base_argv0,
                           NULL);
            exit_status = 1;
            goto cleanup;
        }

        if (options_data[user_data_idx].req_file[0] != '\0') {
            char *filename;
            struct stat buf;
            filename = opal_os_path( false,
                                     options_data[user_data_idx].path_libdir,
                                     options_data[user_data_idx].req_file,
                                     NULL );
            /* Warn (but do not abort) if the required support file is
               missing.  NOTE(review): filename is not freed on either
               branch here -- small one-shot leak in a short-lived
               process. */
            if (0 != stat(filename, &buf)) {
                opal_show_help("help-opal-wrapper.txt", "file-not-found", true,
                               base_argv0,
                               options_data[user_data_idx].req_file,
                               options_data[user_data_idx].language, NULL);
            }
        }
    }

    /****************************************************
     *
     * Parse user flags
     *
     ****************************************************/

    /* Start wanting everything; switches below subtract from this set. */
    flags = COMP_WANT_COMMAND|COMP_WANT_PREPROC|
        COMP_WANT_COMPILE|COMP_WANT_LINK;

    user_argv = opal_argv_copy(argv + 1);
    user_argc = opal_argv_count(user_argv);

    for (i = 0 ; i < user_argc ; ++i) {
        /* Note: prefix matches, so "-showme" also catches "-showme:...";
           the specific "-showme:<keyword>" forms are distinguished below. */
        if (0 == strncmp(user_argv[i], "-showme", strlen("-showme")) ||
            0 == strncmp(user_argv[i], "--showme", strlen("--showme")) ||
            0 == strncmp(user_argv[i], "-show", strlen("-show")) ||
            0 == strncmp(user_argv[i], "--show", strlen("--show"))) {
            bool done_now = false;

            /* check for specific things we want to see.  First three
               still invoke all the building routines.  Last set want
               to parse out certain flags, so we don't go through the
               normal build routine - skip to cleanup. */
            if (0 == strncmp(user_argv[i], "-showme:command",
                             strlen("-showme:command")) ||
                0 == strncmp(user_argv[i], "--showme:command",
                             strlen("--showme:command"))) {
                flags = COMP_WANT_COMMAND;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:compile",
                                    strlen("-showme:compile")) ||
                       0 == strncmp(user_argv[i], "--showme:compile",
                                    strlen("--showme:compile"))) {
                flags = COMP_WANT_PREPROC|COMP_WANT_COMPILE;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:link",
                                    strlen("-showme:link")) ||
                       0 == strncmp(user_argv[i], "--showme:link",
                                    strlen("--showme:link"))) {
                flags = COMP_WANT_COMPILE|COMP_WANT_LINK;
                /* we know what we want, so don't process any more args */
                done_now = true;
            } else if (0 == strncmp(user_argv[i], "-showme:incdirs",
                                    strlen("-showme:incdirs")) ||
                       0 == strncmp(user_argv[i], "--showme:incdirs",
                                    strlen("--showme:incdirs"))) {
                print_flags(options_data[user_data_idx].preproc_flags,
                            OPAL_INCLUDE_FLAG);
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:libdirs",
                                    strlen("-showme:libdirs")) ||
                       0 == strncmp(user_argv[i], "--showme:libdirs",
                                    strlen("--showme:libdirs"))) {
                print_flags(options_data[user_data_idx].link_flags,
                            OPAL_LIBDIR_FLAG);
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:libs",
                                    strlen("-showme:libs")) ||
                       0 == strncmp(user_argv[i], "--showme:libs",
                                    strlen("--showme:libs"))) {
                print_flags(options_data[user_data_idx].libs, "-l");
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:version",
                                    strlen("-showme:version")) ||
                       0 == strncmp(user_argv[i], "--showme:version",
                                    strlen("--showme:version"))) {
                char * str;
                str = opal_show_help_string("help-opal-wrapper.txt",
                                            "version", false,
                                            argv[0],
                                            options_data[user_data_idx].project,
                                            options_data[user_data_idx].version,
                                            options_data[user_data_idx].language,
                                            NULL);
                if (NULL != str) {
                    printf("%s", str);
                    free(str);
                }
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:help",
                                    strlen("-showme:help")) ||
                       0 == strncmp(user_argv[i], "--showme:help",
                                    strlen("--showme:help"))) {
                char *str;
                str = opal_show_help_string("help-opal-wrapper.txt", "usage",
                                            false, argv[0],
                                            options_data[user_data_idx].project,
                                            NULL);
                if (NULL != str) {
                    printf("%s", str);
                    free(str);
                }
                exit_status = 0;
                goto cleanup;
            } else if (0 == strncmp(user_argv[i], "-showme:",
                                    strlen("-showme:")) ||
                       0 == strncmp(user_argv[i], "--showme:",
                                    strlen("--showme:"))) {
                /* Any other "-showme:<...>" is an error. */
                fprintf(stderr, "%s: unrecognized option: %s\n", argv[0],
                        user_argv[i]);
                fprintf(stderr, "Type '%s --showme:help' for usage.\n",
                        argv[0]);
                exit_status = 1;
                goto cleanup;
            }

            /* A bare -showme: dry run, and show errors from the child. */
            flags |= (COMP_DRY_RUN|COMP_SHOW_ERROR);
            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
            --i;

            if (done_now) {
                disable_flags = false;
                break;
            }

        } else if (0 == strcmp(user_argv[i], "-c")) {
            /* compile-only: no link step */
            flags &= ~COMP_WANT_LINK;
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-E") ||
                   0 == strcmp(user_argv[i], "-M")) {
            /* preprocess-only: no compile, no link */
            flags &= ~(COMP_WANT_COMPILE | COMP_WANT_LINK);
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-S")) {
            /* assembly output: no link step */
            flags &= ~COMP_WANT_LINK;
            real_flag = true;
        } else if (0 == strcmp(user_argv[i], "-lpmpi")) {
            /* Handled by the wrapper, not passed to the compiler. */
            flags |= COMP_WANT_PMPI;
            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
            --i;
        } else if (0 == strcmp(user_argv[i], "-static") ||
                   0 == strcmp(user_argv[i], "--static") ||
                   0 == strcmp(user_argv[i], "-Bstatic") ||
                   0 == strcmp(user_argv[i], "-Wl,-static") ||
                   0 == strcmp(user_argv[i], "-Wl,--static") ||
                   0 == strcmp(user_argv[i], "-Wl,-Bstatic")) {
            /* Any common spelling of "link statically" (still passed
               through to the compiler). */
            flags |= COMP_WANT_STATIC;
        } else if (0 == strcmp(user_argv[i], "-dynamic") ||
                   0 == strcmp(user_argv[i], "--dynamic") ||
                   0 == strcmp(user_argv[i], "-Bdynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,-dynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,--dynamic") ||
                   0 == strcmp(user_argv[i], "-Wl,-Bdynamic")) {
            flags &= ~COMP_WANT_STATIC;
        } else if (0 == strcmp(user_argv[i], "--openmpi:linkall")) {
            /* This is an intentionally undocummented wrapper compiler
               switch.  It should only be used by Open MPI developers
               -- not end users.  It will cause mpicc to use the
               static library list, even if we're compiling
               dynamically (i.e., it'll specifically -lopen-rte and
               -lopen-pal (and all their dependent libs)).  We provide
               this flag for test MPI applications that also invoke
               ORTE and/or OPAL function calls.

               On some systems (e.g., OS X), if the top-level
               application calls ORTE/OPAL functions and you don't -l
               ORTE and OPAL, then the functions won't be resolved at
               link time (i.e., the implicit library dependencies of
               libmpi won't be pulled in at link time), and therefore
               the link will fail.  This flag will cause the wrapper
               to explicitly list the ORTE and OPAL libs on the
               underlying compiler command line, so the application
               will therefore link properly. */
            flags |= COMP_WANT_LINKALL;

            /* remove element from user_argv */
            opal_argv_delete(&user_argc, &user_argv, i, 1);
        } else if ('-' != user_argv[i][0]) {
            /* A non-dash argument (presumably a source file): keep all
               the want flags enabled. */
            disable_flags = false;
            flags |= COMP_SHOW_ERROR;
            real_flag = true;
        } else {
            /* if the option flag is one that we use to determine
               which set of compiler data to use, don't count it as a
               real option */
            if (find_options_index(user_argv[i]) < 0) {
                real_flag = true;
            }
        }
    }

    /* clear out the want_flags if we got no arguments not starting
       with a - (dash) and -showme wasn't given OR -showme was given
       and we had at least one more non-showme argument that started
       with a - (dash) and no other non-dash arguments.

       Some examples:
       opal_wrapper                : clear our flags
       opal_wrapper -v             : clear our flags
       opal_wrapper -E a.c         : don't clear our flags
       opal_wrapper a.c            : don't clear our flags
       opal_wrapper -showme        : don't clear our flags
       opal_wrapper -showme -v     : clear our flags
       opal_wrapper -showme -E a.c : don't clear our flags
       opal_wrapper -showme a.c    : don't clear our flags */
    if (disable_flags && !((flags & COMP_DRY_RUN) && !real_flag)) {
        flags &= ~(COMP_WANT_PREPROC|COMP_WANT_COMPILE|COMP_WANT_LINK);
    }

    /****************************************************
     *
     * Assemble the command line
     *
     ****************************************************/

    /* compiler (may be multiple arguments, so split) */
    if (flags & COMP_WANT_COMMAND) {
        exec_argv = opal_argv_split(options_data[user_data_idx].compiler, ' ');
        exec_argc = opal_argv_count(exec_argv);
    } else {
        /* No compiler wanted: start with an empty, NULL-terminated argv. */
        exec_argv = (char **) malloc(sizeof(char*));
        exec_argv[0] = NULL;
        exec_argc = 0;
    }

    /* This error would normally not happen unless the user edits the
       wrapper data files manually.  NOTE(review): this check only
       catches opal_argv_split() returning NULL; the else branch above
       dereferences its malloc result before reaching this check. */
    if (NULL == exec_argv) {
        opal_show_help("help-opal-wrapper.txt", "no-compiler-specified", true);
        return 1;
    }

    if (flags & COMP_WANT_COMPILE) {
        opal_argv_insert(&exec_argv, exec_argc,
                         options_data[user_data_idx].comp_flags_prefix);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* Per https://svn.open-mpi.org/trac/ompi/ticket/2201, add all the
       user arguments before anything else. */
    opal_argv_insert(&exec_argv, exec_argc, user_argv);
    exec_argc = opal_argv_count(exec_argv);

    /* preproc flags */
    if (flags & COMP_WANT_PREPROC) {
        opal_argv_insert(&exec_argv, exec_argc,
                         options_data[user_data_idx].preproc_flags);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* compiler flags */
    if (flags & COMP_WANT_COMPILE) {
        opal_argv_insert(&exec_argv, exec_argc,
                         options_data[user_data_idx].comp_flags);
        exec_argc = opal_argv_count(exec_argv);
    }

    /* link flags and libs */
    if (flags & COMP_WANT_LINK) {
        bool have_static_lib;
        bool have_dyn_lib;
        bool use_static_libs;
        char *filename1, *filename2;
        struct stat buf;

        opal_argv_insert(&exec_argv, exec_argc,
                         options_data[user_data_idx].link_flags);
        exec_argc = opal_argv_count(exec_argv);

        /* Are we linking statically?  If so, decide what libraries to
           list.  It depends on two factors:

           1. Was --static (etc.) specified?
           2. Does OMPI have static, dynamic, or both libraries installed?

           Here's a matrix showing what we'll do in all 6 cases:

           What's installed    --static    no --static
           ----------------    ----------  -----------
           ompi .so libs       -lmpi       -lmpi
           ompi .a libs        all         all
           ompi both libs      all         -lmpi

        */
        filename1 = opal_os_path( false,
                                  options_data[user_data_idx].path_libdir,
                                  options_data[user_data_idx].static_lib_file,
                                  NULL );
        if (0 == stat(filename1, &buf)) {
            have_static_lib = true;
        } else {
            have_static_lib = false;
        }

        filename2 = opal_os_path( false,
                                  options_data[user_data_idx].path_libdir,
                                  options_data[user_data_idx].dyn_lib_file,
                                  NULL );
        if (0 == stat(filename2, &buf)) {
            have_dyn_lib = true;
        } else {
            have_dyn_lib = false;
        }

        /* Determine which set of libs to use: dynamic or static.  Be
           pedantic to make the code easy to read. */
        if (flags & COMP_WANT_LINKALL) {
            /* If --openmpi:linkall was specified, list all the libs
               (i.e., the static libs) if they're available, either in
               static or dynamic form. */
            if (have_static_lib || have_dyn_lib) {
                use_static_libs = true;
            } else {
                fprintf(stderr, "The linkall option has failed as we were unable to find either static or dynamic libs\n"
                        "Files looked for:\n  Static: %s\n  Dynamic: %s\n",
                        filename1, filename2);
                free(filename1);
                free(filename2);
                exit(1);
            }
        } else if (flags & COMP_WANT_STATIC) {
            /* If --static (or something like it) was specified, if we
               have the static libs, then use them.  Otherwise, use the
               dynamic libs. */
            if (have_static_lib) {
                use_static_libs = true;
            } else {
                use_static_libs = false;
            }
        } else {
            /* If --static (or something like it) was NOT specified (or
               if --dyanic, or something like it, was specified), if we
               have the dynamic libs, then use them.  Otherwise, use the
               static libs. */
            if (have_dyn_lib) {
                use_static_libs = false;
            } else {
                use_static_libs = true;
            }
        }
        free(filename1);
        free(filename2);

        if (use_static_libs) {
            opal_argv_insert(&exec_argv, exec_argc,
                             options_data[user_data_idx].libs_static);
        } else {
            opal_argv_insert(&exec_argv, exec_argc,
                             options_data[user_data_idx].libs);
        }
        exec_argc = opal_argv_count(exec_argv);
    }

    /****************************************************
     *
     * Execute the command
     *
     ****************************************************/

    if (flags & COMP_DRY_RUN) {
        /* -showme: print the assembled command instead of running it. */
        exec_command = opal_argv_join(exec_argv, ' ');
        printf("%s\n", exec_command);
    } else {
        char *tmp;
#if 0
        exec_command = opal_argv_join(exec_argv, ' ');
        printf("command: %s\n", exec_command);
#endif
        /* Resolve the compiler through PATH before exec'ing it. */
        tmp = opal_path_findv(exec_argv[0], 0, environ, NULL);
        if (NULL == tmp) {
            opal_show_help("help-opal-wrapper.txt", "no-compiler-found", true,
                           exec_argv[0], NULL);
            errno = 0;
            exit_status = 1;
        }  else {
            int status;

            free(exec_argv[0]);
            exec_argv[0] = tmp;
            ret = opal_few(exec_argv, &status);
            /* Map the raw wait() status to a shell-style exit code:
               normal exit -> exit status, killed -> signal number,
               stopped -> stop signal, anything else -> 255. */
            exit_status = WIFEXITED(status) ? WEXITSTATUS(status) :
                              (WIFSIGNALED(status) ? WTERMSIG(status) :
                                  (WIFSTOPPED(status) ? WSTOPSIG(status) :
                                      255));
            if( (OPAL_SUCCESS != ret) ||
                ((0 != exit_status) && (flags & COMP_SHOW_ERROR)) ) {
                char* exec_command = opal_argv_join(exec_argv, ' ');
                if( OPAL_SUCCESS != ret ) {
                    /* NOTE(review): passes the wait status (not errno)
                       to strerror() -- confirm that is the intended
                       argument. */
                    opal_show_help("help-opal-wrapper.txt", "spawn-failed",
                                   true, exec_argv[0], strerror(status),
                                   exec_command, NULL);
                } else {
#if 0
                    opal_show_help("help-opal-wrapper.txt", "compiler-failed",
                                   true, exec_argv[0], exit_status,
                                   exec_command, NULL);
#endif
                }
                free(exec_command);
            }
        }
    }

    /****************************************************
     *
     * Cleanup
     *
     ****************************************************/

 cleanup:
    opal_argv_free(exec_argv);
    opal_argv_free(user_argv);
    if (NULL != base_argv0) free(base_argv0);

    if (OPAL_SUCCESS != (ret = data_finalize())) {
        return ret;
    }

    if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
        return ret;
    }

    return exit_status;
}
/*
 * Back-end implementation of MPI_FINALIZE.
 *
 * Tears down the MPI layer in a carefully ordered sequence: free
 * MPI_COMM_SELF attributes first (per MPI-2:4.8), detach the bsend
 * buffer, fence across the job via PMIx, then release communication
 * objects (files, windows, communicators), call del_procs/close the
 * PML, free secondary resources (attrs, groups, errhandlers, ops,
 * datatypes, info), close the MCA frameworks, and finally leave the
 * RTE and finalize OPAL util.
 *
 * Returns MPI_SUCCESS, MPI_ERR_OTHER if called before init or twice,
 * or the first error code hit during teardown (teardown stops at the
 * first failure via `goto done`).
 */
int ompi_mpi_finalize(void)
{
    int ret = MPI_SUCCESS;
    opal_list_item_t *item;
    ompi_proc_t** procs;
    size_t nprocs;
    OPAL_TIMING_DECLARE(tm);
    OPAL_TIMING_INIT_EXT(&tm, OPAL_TIMING_GET_TIME_OF_DAY);

    /* Be a bit social if an erroneous program calls MPI_FINALIZE in
       two different threads, otherwise we may deadlock in
       ompi_comm_free() (or run into other nasty lions, tigers, or
       bears).

       This lock is held for the duration of ompi_mpi_init() and
       ompi_mpi_finalize().  Hence, if we get it, then no other thread
       is inside the critical section (and we don't have to check the
       *_started bool variables). */
    opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
    if (!ompi_mpi_initialized || ompi_mpi_finalized) {
        /* Note that if we're not initialized or already finalized, we
           cannot raise an MPI exception.  The best that we can do is
           write something to stderr. */
        char hostname[MAXHOSTNAMELEN];
        pid_t pid = getpid();
        gethostname(hostname, sizeof(hostname));

        if (ompi_mpi_initialized) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_finalize: not initialized",
                           true, hostname, pid);
        } else if (ompi_mpi_finalized) {
            opal_show_help("help-mpi-runtime.txt",
                           "mpi_finalize:invoked_multiple_times",
                           true, hostname, pid);
        }
        opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
        return MPI_ERR_OTHER;
    }
    ompi_mpi_finalize_started = true;

    /* Shut down MPI extensions before any core teardown. */
    ompi_mpiext_fini();

    /* Per MPI-2:4.8, we have to free MPI_COMM_SELF before doing
       anything else in MPI_FINALIZE (to include setting up such that
       MPI_FINALIZED will return true). */
    if (NULL != ompi_mpi_comm_self.comm.c_keyhash) {
        ompi_attr_delete_all(COMM_ATTR, &ompi_mpi_comm_self,
                             ompi_mpi_comm_self.comm.c_keyhash);
        OBJ_RELEASE(ompi_mpi_comm_self.comm.c_keyhash);
        ompi_mpi_comm_self.comm.c_keyhash = NULL;
    }

    /* Proceed with MPI_FINALIZE */
    ompi_mpi_finalized = true;

    /* As finalize is the last legal MPI call, we are allowed to force
       the release of the user buffer used for bsend, before going
       anywhere further. */
    (void)mca_pml_base_bsend_detach(NULL, NULL);

#if OPAL_ENABLE_PROGRESS_THREADS == 0
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE | OPAL_EVLOOP_NONBLOCK);
#endif

    /* Redo ORTE calling opal_progress_event_users_increment() during
       MPI lifetime, to get better latency when not using TCP */
    opal_progress_event_users_increment();

    /* check to see if we want timing information */
    OPAL_TIMING_MSTART((&tm,"time to execute finalize barrier"));

    /* NOTE: MPI-2.1 requires that MPI_FINALIZE is "collective" across
       *all* connected processes.  This only means that all processes
       have to call it.  It does *not* mean that all connected
       processes need to synchronize (either directly or indirectly).

       For example, it is quite easy to construct complicated
       scenarios where one job is "connected" to another job via
       transitivity, but have no direct knowledge of each other.
       Consider the following case: job A spawns job B, and job B
       later spawns job C.  A "connectedness" graph looks something
       like this:

           A <--> B <--> C

       So what are we *supposed* to do in this case?  If job A is
       still connected to B when it calls FINALIZE, should it block
       until jobs B and C also call FINALIZE?

       After lengthy discussions many times over the course of this
       project, the issue was finally decided at the Louisville Feb
       2009 meeting: no.

       Rationale:

       - "Collective" does not mean synchronizing.  It only means that
         every process call it.  Hence, in this scenario, every
         process in A, B, and C must call FINALIZE.

       - KEY POINT: if A calls FINALIZE, then it is erroneous for B or
         C to try to communicate with A again.

       - Hence, OMPI is *correct* to only effect a barrier across each
         jobs' MPI_COMM_WORLD before exiting.  Specifically, if A
         calls FINALIZE long before B or C, it's *correct* if A exits
         at any time (and doesn't notify B or C that it is exiting).

       - Arguably, if B or C do try to communicate with the now-gone
         A, OMPI should try to print a nice error ("you tried to
         communicate with a job that is already gone...") instead of
         segv or other Badness.  However, that is an *extremely*
         difficult problem -- sure, it's easy for A to tell B that it
         is finalizing, but how can A tell C?  A doesn't even know
         about C.  You'd need to construct a "connected" graph in a
         distributed fashion, which is fraught with race conditions,
         etc.

       Hence, our conclusion is: OMPI is *correct* in its current
       behavior (of only doing a barrier across its own COMM_WORLD)
       before exiting.  Any problems that occur are as a result of
       erroneous MPI applications.  We *could* tighten up the
       erroneous cases and ensure that we print nice error messages /
       don't crash, but that is such a difficult problem that we
       decided we have many other, much higher priority issues to
       handle that deal with non-erroneous cases. */

    /* Wait for everyone to reach this point.  This is a grpcomm
       barrier instead of an MPI barrier for (at least) two reasons:

       1. An MPI barrier doesn't ensure that all messages have been
          transmitted before exiting (e.g., a BTL can lie and buffer a
          message without actually injecting it to the network, and
          therefore require further calls to that BTL's progress), so
          the possibility of a stranded message exists.

       2. If the MPI communication is using an unreliable transport,
          there's a problem of knowing that everyone has *left* the
          barrier.  E.g., one proc can send its ACK to the barrier
          message to a peer and then leave the barrier, but the ACK
          can get lost and therefore the peer is left in the barrier.

       Point #1 has been known for a long time; point #2 emerged after
       we added the first unreliable BTL to Open MPI and fixed the
       del_procs behavior around May of 2014 (see
       https://svn.open-mpi.org/trac/ompi/ticket/4669#comment:4 for
       more details). */
    opal_pmix.fence(NULL, 0);

    /* check for timing request - get stop time and report elapsed
       time if so */
    OPAL_TIMING_MSTOP(&tm);
    OPAL_TIMING_DELTAS(ompi_enable_timing, &tm);
    OPAL_TIMING_REPORT(ompi_enable_timing_ext, &tm);
    OPAL_TIMING_RELEASE(&tm);

    /*
     * Shutdown the Checkpoint/Restart Mech.
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_finalize())) {
        OMPI_ERROR_LOG(ret);
    }

    /* Shut down any bindings-specific issues: C++, F77, F90 */

    /* Remove all memory associated by MPI_REGISTER_DATAREP (per
       MPI-2:9.5.3, there is no way for an MPI application to
       *un*register datareps, but we don't want the OMPI layer causing
       memory leaks). */
    while (NULL != (item = opal_list_remove_first(&ompi_registered_datareps))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&ompi_registered_datareps);

    /* Remove all F90 types from the hash tables. As the OBJ_DESTRUCT will
     * call a special destructor able to release predefined types, we can
     * simply call the OBJ_DESTRUCT on the hash table and all memory will
     * be correctly released. */
    OBJ_DESTRUCT( &ompi_mpi_f90_integer_hashtable );
    OBJ_DESTRUCT( &ompi_mpi_f90_real_hashtable );
    OBJ_DESTRUCT( &ompi_mpi_f90_complex_hashtable );

    /* Free communication objects */

    /* free file resources */
    if (OMPI_SUCCESS != (ret = ompi_file_finalize())) {
        goto done;
    }

    /* free window resources */
    if (OMPI_SUCCESS != (ret = ompi_win_finalize())) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = ompi_osc_base_finalize())) {
        goto done;
    }

    /* free communicator resources. this MUST come before finalizing the PML
     * as this will call into the pml */
    if (OMPI_SUCCESS != (ret = ompi_comm_finalize())) {
        goto done;
    }

    /* call del_procs on all allocated procs even though some may not be known
     * to the pml layer. the pml layer is expected to be resilient and ignore
     * any unknown procs. */
    nprocs = 0;
    procs = ompi_proc_get_allocated (&nprocs);
    MCA_PML_CALL(del_procs(procs, nprocs));
    free(procs);

    /* free pml resource */
    if(OMPI_SUCCESS != (ret = mca_pml_base_finalize())) {
        goto done;
    }

    /* free requests */
    if (OMPI_SUCCESS != (ret = ompi_request_finalize())) {
        goto done;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_finalize())) {
        goto done;
    }

    /* If requested, print out a list of memory allocated by ALLOC_MEM
       but not freed by FREE_MEM */
    if (0 != ompi_debug_show_mpi_alloc_mem_leaks) {
        mca_mpool_base_tree_print(ompi_debug_show_mpi_alloc_mem_leaks);
    }

    /* Now that all MPI objects dealing with communications are gone,
       shut down MCA types having to do with communications */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_pml_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }

    /* shut down buffered send code */
    mca_pml_base_bsend_fini();

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Shutdown the CRCP Framework, must happen after PML shutdown
     */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_crcp_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }
#endif

    /* Free secondary resources */

    /* free attr resources */
    if (OMPI_SUCCESS != (ret = ompi_attr_finalize())) {
        goto done;
    }

    /* free group resources */
    if (OMPI_SUCCESS != (ret = ompi_group_finalize())) {
        goto done;
    }

    /* finalize the DPM subsystem */
    if ( OMPI_SUCCESS != (ret = ompi_dpm_finalize())) {
        goto done;
    }

    /* free internal error resources */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_finalize())) {
        goto done;
    }

    /* free error code resources */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_finalize())) {
        goto done;
    }

    /* free errhandler resources */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_finalize())) {
        goto done;
    }

    /* Free all other resources */

    /* free op resources */
    if (OMPI_SUCCESS != (ret = ompi_op_finalize())) {
        goto done;
    }

    /* free ddt resources */
    if (OMPI_SUCCESS != (ret = ompi_datatype_finalize())) {
        goto done;
    }

    /* free info resources */
    if (OMPI_SUCCESS != (ret = ompi_info_finalize())) {
        goto done;
    }

    /* Close down MCA modules */

    /* io is opened lazily, so it's only necessary to close it if it
       was actually opened */
    if (0 < ompi_io_base_framework.framework_refcnt) {
        /* May have been "opened" multiple times. We want it closed now */
        ompi_io_base_framework.framework_refcnt = 1;

        if (OMPI_SUCCESS != mca_base_framework_close(&ompi_io_base_framework)) {
            goto done;
        }
    }
    (void) mca_base_framework_close(&ompi_topo_base_framework);
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_osc_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_coll_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_bml_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_mpool_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_rcache_base_framework))) {
        goto done;
    }
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_allocator_base_framework))) {
        goto done;
    }

    /* free proc resources */
    if ( OMPI_SUCCESS != (ret = ompi_proc_finalize())) {
        goto done;
    }

    if (NULL != ompi_mpi_main_thread) {
        OBJ_RELEASE(ompi_mpi_main_thread);
        ompi_mpi_main_thread = NULL;
    }

    /* Clean up memory/resources from the MPI dynamic process
       functionality checker */
    ompi_mpi_dynamics_finalize();

    /* Leave the RTE */
    if (OMPI_SUCCESS != (ret = ompi_rte_finalize())) {
        goto done;
    }
    ompi_rte_initialized = false;

    /* now close the rte framework */
    if (OMPI_SUCCESS != (ret = mca_base_framework_close(&ompi_rte_base_framework) ) ) {
        OMPI_ERROR_LOG(ret);
        goto done;
    }

    if (OPAL_SUCCESS != (ret = opal_finalize_util())) {
        goto done;
    }

    /* All done */

 done:
    /* Single unlock point: reached on both success and error paths. */
    opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);

    return ret;
}
int main(int argc, char *argv[]) { int ret = 0; bool want_help = false; bool cmd_error = false; bool acted = false; bool want_all = false; char **app_env = NULL, **global_env = NULL; int i, len; char *str; /* protect against problems if someone passes us thru a pipe * and then abnormally terminates the pipe early */ signal(SIGPIPE, SIG_IGN); /* Initialize the argv parsing handle */ if (ORTE_SUCCESS != opal_init_util(&argc, &argv)) { orte_show_help("help-orte-info.txt", "lib-call-fail", true, "opal_init_util", __FILE__, __LINE__, NULL); exit(ret); } orte_info_cmd_line = OBJ_NEW(opal_cmd_line_t); if (NULL == orte_info_cmd_line) { ret = errno; orte_show_help("help-orte-info.txt", "lib-call-fail", true, "opal_cmd_line_create", __FILE__, __LINE__, NULL); opal_finalize_util(); exit(ret); } opal_cmd_line_make_opt3(orte_info_cmd_line, 'v', NULL, "version", 2, "Show version of ORTE or a component. The first parameter can be the keywords \"orte\" or \"all\", a framework name (indicating all components in a framework), or a framework:component string (indicating a specific component). The second parameter can be one of: full, major, minor, release, greek, svn."); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "param", 2, "Show MCA parameters. The first parameter is the framework (or the keyword \"all\"); the second parameter is the specific component name (or the keyword \"all\")."); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "internal", 0, "Show internal MCA parameters (not meant to be modified by users)"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "path", 1, "Show paths that Open MPI was configured with. 
Accepts the following parameters: prefix, bindir, libdir, incdir, mandir, pkglibdir, sysconfdir"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "arch", 0, "Show architecture Open MPI was corteled on"); opal_cmd_line_make_opt3(orte_info_cmd_line, 'c', NULL, "config", 0, "Show configuration options"); opal_cmd_line_make_opt3(orte_info_cmd_line, 'h', NULL, "help", 0, "Show this help message"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "orte_info_pretty", 0, "When used in conjunction with other parameters, the output is displayed in 'orte_info_prettyprint' format (default)"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "parsable", 0, "When used in conjunction with other parameters, the output is displayed in a machine-parsable format"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "parseable", 0, "Synonym for --parsable"); opal_cmd_line_make_opt3(orte_info_cmd_line, '\0', NULL, "hostname", 0, "Show the hostname that Open MPI was configured " "and built on"); opal_cmd_line_make_opt3(orte_info_cmd_line, 'a', NULL, "all", 0, "Show all configuration options and MCA parameters"); /* Call some useless functions in order to guarantee to link in some * global variables. Only check the return value so that the * corteler doesn't optimize out the useless function. */ if (ORTE_SUCCESS != orte_locks_init()) { /* Stop .. or I'll say stop again! 
*/ ++ret; } else { --ret; } /* set our threading level */ opal_set_using_threads(false); /* Get MCA parameters, if any */ if( ORTE_SUCCESS != mca_base_open() ) { orte_show_help("help-orte-info.txt", "lib-call-fail", true, "mca_base_open", __FILE__, __LINE__ ); OBJ_RELEASE(orte_info_cmd_line); opal_finalize_util(); exit(1); } mca_base_cmd_line_setup(orte_info_cmd_line); /* Do the parsing */ ret = opal_cmd_line_parse(orte_info_cmd_line, false, argc, argv); if (OPAL_SUCCESS != ret) { if (OPAL_ERR_SILENT != ret) { fprintf(stderr, "%s: command line error (%s)\n", argv[0], opal_strerror(ret)); } cmd_error = true; } if (!cmd_error && (opal_cmd_line_is_taken(orte_info_cmd_line, "help") || opal_cmd_line_is_taken(orte_info_cmd_line, "h"))) { char *str, *usage; want_help = true; usage = opal_cmd_line_get_usage_msg(orte_info_cmd_line); str = opal_show_help_string("help-orte-info.txt", "usage", true, usage); if (NULL != str) { printf("%s", str); free(str); } free(usage); } if (cmd_error || want_help) { mca_base_close(); OBJ_RELEASE(orte_info_cmd_line); opal_finalize_util(); exit(cmd_error ? 1 : 0); } mca_base_cmd_line_process_args(orte_info_cmd_line, &app_env, &global_env); /* putenv() all the stuff that we got back from env (in case the * user specified some --mca params on the command line). This * creates a memory leak, but that's unfortunately how putenv() * works. 
:-( */ len = opal_argv_count(app_env); for (i = 0; i < len; ++i) { putenv(app_env[i]); } len = opal_argv_count(global_env); for (i = 0; i < len; ++i) { putenv(global_env[i]); } /* setup the mca_types array */ OBJ_CONSTRUCT(&mca_types, opal_pointer_array_t); opal_pointer_array_init(&mca_types, 256, INT_MAX, 128); opal_info_register_types(&mca_types); orte_info_register_types(&mca_types); /* Execute the desired action(s) */ if (opal_cmd_line_is_taken(orte_info_cmd_line, "orte_info_pretty")) { orte_info_pretty = true; } else if (opal_cmd_line_is_taken(orte_info_cmd_line, "parsable") || opal_cmd_line_is_taken(orte_info_cmd_line, "parseable")) { orte_info_pretty = false; } want_all = opal_cmd_line_is_taken(orte_info_cmd_line, "all"); if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "version")) { orte_info_do_version(want_all, orte_info_cmd_line); acted = true; } if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "path")) { orte_info_do_path(want_all, orte_info_cmd_line); acted = true; } if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "arch")) { orte_info_do_arch(); acted = true; } if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "hostname")) { orte_info_do_hostname(); acted = true; } if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "config")) { orte_info_do_config(true); acted = true; } if (want_all || opal_cmd_line_is_taken(orte_info_cmd_line, "param")) { orte_info_do_params(want_all, opal_cmd_line_is_taken(orte_info_cmd_line, "internal")); acted = true; } /* If no command line args are specified, show default set */ if (!acted) { orte_info_show_orte_version(orte_info_ver_full); orte_info_show_path(orte_info_path_prefix, opal_install_dirs.prefix); orte_info_do_arch(); orte_info_do_hostname(); orte_info_do_config(false); orte_info_components_open(); for (i = 0; i < mca_types.size; ++i) { if (NULL == (str = (char*)opal_pointer_array_get_item(&mca_types, i))) { continue; } orte_info_show_component_version(str, 
orte_info_component_all, orte_info_ver_full, orte_info_type_all); } } /* All done */ if (NULL != app_env) { opal_argv_free(app_env); } if (NULL != global_env) { opal_argv_free(global_env); } orte_info_components_close (); OBJ_RELEASE(orte_info_cmd_line); OBJ_DESTRUCT(&mca_types); mca_base_close(); opal_finalize_util(); return 0; }