static int mca_btl_sctp_component_open(void)
{
#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if (WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_sctp_component.sctp_listen_sd = -1;    /* TODO different sd for ipv6 */
    mca_btl_sctp_component.sctp_num_btls = 0;      /* addr_count */
    mca_btl_sctp_component.sctp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_sctp_component.sctp_procs, 256);

#if MCA_BTL_SCTP_DONT_USE_HASH
    /* TODO make this only allocate how much it needs to. Currently
     * allocates 256 (to match sctp_procs). recvr_proc_table and
     * sender_proc_table are malloc'd in mca_btl_sctp_component_init. */
    recvr_proc_table = NULL;
    sender_proc_table = NULL;
#else
    OBJ_CONSTRUCT(&mca_btl_sctp_component.sctp_assocID_hash, opal_hash_table_t);
    opal_hash_table_init(&mca_btl_sctp_component.sctp_assocID_hash, 256);
#endif

    /* if_include and if_exclude need to be mutually exclusive */
    if (OPAL_SUCCESS != mca_base_param_check_exclusive_string(
            mca_btl_sctp_component.super.btl_version.mca_type_name,
            mca_btl_sctp_component.super.btl_version.mca_component_name,
            "if_include",
            mca_btl_sctp_component.super.btl_version.mca_type_name,
            mca_btl_sctp_component.super.btl_version.mca_component_name,
            "if_exclude")) {
        /* Return ERR_NOT_AVAILABLE so that a warning message about
         * "open" failing is not printed */
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* setup receive buffer */
    if (0 == mca_btl_sctp_recv_handler_initbuf()) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}
/*
 * Initialize global variables used within this module.
 */
static void tcp_init(void)
{
    /* setup the module's state variables */
    OBJ_CONSTRUCT(&mca_oob_tcp_module.peers, opal_hash_table_t);
    opal_hash_table_init(&mca_oob_tcp_module.peers, 32);
    mca_oob_tcp_module.ev_active = false;

    if (orte_oob_base.use_module_threads) {
        /* if we are to use independent progress threads at
         * the module level, start one now */
        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                            "%s STARTING TCP PROGRESS THREAD",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        mca_oob_tcp_module.ev_base = opal_event_base_create();
        /* construct the thread object */
        OBJ_CONSTRUCT(&mca_oob_tcp_module.progress_thread, opal_thread_t);
        /* fork off a thread to progress it */
        mca_oob_tcp_module.progress_thread.t_run = progress_thread_engine;
        mca_oob_tcp_module.ev_active = true;
        if (OPAL_SUCCESS != opal_thread_start(&mca_oob_tcp_module.progress_thread)) {
            opal_output(0, "%s progress thread failed to start",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        }
    }
}
mca_monitoring_coll_data_t* mca_common_monitoring_coll_new( ompi_communicator_t*comm )
{
    mca_monitoring_coll_data_t* data = OBJ_NEW(mca_monitoring_coll_data_t);
    if( NULL == data ) {
        OPAL_MONITORING_PRINT_ERR("coll: new: data structure cannot be allocated");
        return NULL;
    }
    data->procs     = NULL;
    data->comm_name = NULL;
    data->p_comm    = comm;

    /* Allocate hashtable */
    if( NULL == comm_data ) {
        comm_data = OBJ_NEW(opal_hash_table_t);
        if( NULL == comm_data ) {
            OPAL_MONITORING_PRINT_ERR("coll: new: failed to allocate hashtable");
            return data;
        }
        opal_hash_table_init(comm_data, 2048);
    }

    /* Insert in hashtable, keyed by the communicator pointer */
    uint64_t key = *((uint64_t*)&comm);
    if( OPAL_SUCCESS != opal_hash_table_set_value_uint64(comm_data, key, (void*)data) ) {
        OPAL_MONITORING_PRINT_ERR("coll: new: failed to allocate memory or "
                                  "grow the hash table");
    }

    /* Cache data so the procs can be released without affecting the output */
    mca_common_monitoring_coll_cache(data);
    return data;
}
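/*
 * Companion sketch (not from the original source): assuming the same
 * module-level `comm_data` table used above, this shows how an entry stored
 * by mca_common_monitoring_coll_new() could later be retrieved, keyed by the
 * communicator pointer. The helper name is hypothetical;
 * opal_hash_table_get_value_uint64() is the matching OPAL accessor.
 */
static mca_monitoring_coll_data_t* mca_common_monitoring_coll_lookup( ompi_communicator_t*comm )
{
    void *data = NULL;
    uint64_t key = *((uint64_t*)&comm);    /* same key derivation as above */
    if( NULL == comm_data ||
        OPAL_SUCCESS != opal_hash_table_get_value_uint64(comm_data, key, &data) ) {
        return NULL;  /* communicator was never registered (or already removed) */
    }
    return (mca_monitoring_coll_data_t*)data;
}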
static mca_oob_t *mca_oob_ud_component_init(int *priority)
{
    struct ibv_device **devices;
    int num_devices, i, rc;

    /* set the priority so that we will select this component
     * only if someone directs us to do so */
    *priority = 0;

    opal_hash_table_init (&mca_oob_ud_component.ud_peers, 1024);

    devices = ibv_get_device_list (&num_devices);
    if (NULL == devices || 0 == num_devices) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:component_init no devices found",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return NULL;
    }

    for (i = 0 ; i < num_devices ; ++i) {
        mca_oob_ud_device_t *device = OBJ_NEW(mca_oob_ud_device_t);
        if (NULL == device) {
            opal_output (0, "oob:ud:component_init malloc failure. errno = %d",
                         errno);
            return NULL;
        }

        rc = mca_oob_ud_device_setup (device, devices[i]);
        if (ORTE_SUCCESS != rc) {
            OBJ_RELEASE(device);
            continue;
        }

        opal_list_append (&mca_oob_ud_component.ud_devices,
                          (opal_list_item_t *) device);

        /* NTH: support only 1 device for now */
        break;
    }

    ibv_free_device_list (devices);

    if (0 == opal_list_get_size (&mca_oob_ud_component.ud_devices)) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:component_init no usable devices found.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return NULL;
    }

    /* have to call the module init here so we can test for an available qpair */
    if (ORTE_SUCCESS != mca_oob_ud_module_init()) {
        return NULL;
    }

    return &mca_oob_ud_module;
}
/**
 * Function for finding and opening either all MCA components,
 * or the one that was specifically requested via an MCA parameter.
 */
static int orte_oob_base_open(mca_base_open_flag_t flags)
{
    /* setup globals */
    orte_oob_base.max_uri_length = -1;
    OBJ_CONSTRUCT(&orte_oob_base.peers, opal_hash_table_t);
    opal_hash_table_init(&orte_oob_base.peers, 128);
    OBJ_CONSTRUCT(&orte_oob_base.actives, opal_list_t);

    /* Open up all available components */
    return mca_base_framework_components_open(&orte_oob_base_framework, flags);
}
int orte_grpcomm_base_modex_init(void)
{
    OBJ_CONSTRUCT(&mutex, opal_mutex_t);
    OBJ_CONSTRUCT(&cond, opal_condition_t);

    modex_data = OBJ_NEW(opal_hash_table_t);
    opal_hash_table_init(modex_data, 256);
    num_entries = 0;
    modex_buffer = OBJ_NEW(opal_buffer_t);

    return ORTE_SUCCESS;
}
static int init(void)
{
    OBJ_CONSTRUCT(&jobfam_list, opal_hash_table_t);
    opal_hash_table_init(&jobfam_list, 128);

    /* setup the global condition and lock */
    OBJ_CONSTRUCT(&cond, opal_condition_t);
    OBJ_CONSTRUCT(&lock, opal_mutex_t);

    lifeline = NULL;

    return ORTE_SUCCESS;
}
/**
 * This PML monitors only the processes in MPI_COMM_WORLD. As OMPI now lazily
 * adds peers on the first call to add_procs, we need to check how many
 * processes are in MPI_COMM_WORLD to create the storage with the right size.
 */
int mca_pml_monitoring_add_procs(struct ompi_proc_t **procs,
                                 size_t nprocs)
{
    opal_process_name_t tmp, wp_name;
    size_t i, peer_rank, nprocs_world;
    uint64_t key;

    if(NULL == translation_ht) {
        translation_ht = OBJ_NEW(opal_hash_table_t);
        opal_hash_table_init(translation_ht, 2048);
        /* get my rank in MPI_COMM_WORLD */
        my_rank = ompi_comm_rank((ompi_communicator_t*)&ompi_mpi_comm_world);
    }

    nprocs_world = ompi_comm_size((ompi_communicator_t*)&ompi_mpi_comm_world);

    /* For all procs in the same MPI_COMM_WORLD we need to add them to the hash table */
    for( i = 0; i < nprocs; i++ ) {

        /* Extract the peer procname from the procs array */
        if( ompi_proc_is_sentinel(procs[i]) ) {
            tmp = ompi_proc_sentinel_to_name((uintptr_t)procs[i]);
        } else {
            tmp = procs[i]->super.proc_name;
        }
        if( tmp.jobid != ompi_proc_local_proc->super.proc_name.jobid ) {
            continue;
        }

        /* Find the rank of the peer in MPI_COMM_WORLD */
        for( peer_rank = 0; peer_rank < nprocs_world; peer_rank++ ) {
            wp_name = ompi_group_get_proc_name(((ompi_communicator_t*)&ompi_mpi_comm_world)->c_remote_group, peer_rank);
            if( 0 != opal_compare_proc( tmp, wp_name ) ) {
                continue;
            }

            /* store the rank (in MPI_COMM_WORLD) of the process, with its
             * name (a unique opal ID) as the key, in the hash table */
            key = *((uint64_t*)&tmp);
            if( OPAL_SUCCESS != opal_hash_table_set_value_uint64(translation_ht, key,
                                                                 (void*)(uintptr_t)peer_rank) ) {
                /* failed to allocate memory or grow the hash table */
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            break;
        }
    }

    return pml_selected_module.pml_add_procs(procs, nprocs);
}
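/*
 * Hedged usage sketch (not part of the original file): once add_procs has
 * populated translation_ht, a monitoring callback can map a peer's opal
 * process name back to its MPI_COMM_WORLD rank. The helper name is
 * hypothetical; opal_hash_table_get_value_uint64() is the OPAL accessor
 * matching the set_value call used above.
 */
static int mca_pml_monitoring_world_rank(opal_process_name_t name, int *rank)
{
    void *value = NULL;
    uint64_t key = *((uint64_t*)&name);   /* same key derivation as above */
    if( OPAL_SUCCESS != opal_hash_table_get_value_uint64(translation_ht, key, &value) ) {
        return OMPI_ERR_NOT_FOUND;        /* peer is not in our MPI_COMM_WORLD */
    }
    *rank = (int)(uintptr_t)value;
    return OMPI_SUCCESS;
}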
/*
 * This will initialize the main list to store key-attribute
 * items. This will be called once, during MPI_INIT().
 */
int ompi_attr_init(void)
{
    int ret;
    void *bogus = (void*) 1;
    int *p = (int *) &bogus;

    keyval_hash = OBJ_NEW(opal_hash_table_t);
    if (NULL == keyval_hash) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    key_bitmap = OBJ_NEW(opal_bitmap_t);
    /*
     * Set the max size to OMPI_FORTRAN_HANDLE_MAX to enforce bound
     */
    opal_bitmap_set_max_size (key_bitmap, OMPI_FORTRAN_HANDLE_MAX);
    if (0 != opal_bitmap_init(key_bitmap, 32)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Probe which int-sized (and MPI_Fint-sized) slot of a pointer holds the
     * low-order bits on this platform: write the value 1 through a void* and
     * scan it as an array. The resulting indices are used when reading
     * attribute values stored through pointer slots from Fortran. */
    for (int_pos = 0; int_pos < (sizeof(void*) / sizeof(int)); ++int_pos) {
        if (p[int_pos] == 1) {
            break;
        }
    }
    for (integer_pos = 0; integer_pos < (sizeof(void*) / sizeof(MPI_Fint)); ++integer_pos) {
        if (p[integer_pos] == 1) {
            break;
        }
    }

    OBJ_CONSTRUCT(&attribute_lock, opal_mutex_t);

    if (OMPI_SUCCESS != (ret = opal_hash_table_init(keyval_hash, ATTR_TABLE_SIZE))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = ompi_attr_create_predefined())) {
        return ret;
    }
    return OMPI_SUCCESS;
}
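/*
 * Standalone illustration (added for clarity, not from the original file) of
 * the pointer probe performed by ompi_attr_init() above: storing the value 1
 * into a void* and scanning it as an array of ints reveals which int-sized
 * slot of a pointer carries the low-order bits on this platform, which is
 * what int_pos/integer_pos capture.
 */
#include <stdio.h>

int main(void)
{
    void *bogus = (void*) 1;
    int *p = (int *) &bogus;
    unsigned pos;

    for (pos = 0; pos < (sizeof(void*) / sizeof(int)); ++pos) {
        if (p[pos] == 1) {
            break;  /* e.g. 0 on little-endian, 1 on big-endian LP64 */
        }
    }
    printf("low-order int sits at index %u of a pointer\n", pos);
    return 0;
}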
static int init(void)
{
    OBJ_CONSTRUCT(&jobfam_list, opal_hash_table_t);
    opal_hash_table_init(&jobfam_list, 128);

    /* setup the global condition and lock */
    OBJ_CONSTRUCT(&cond, opal_condition_t);
    OBJ_CONSTRUCT(&lock, opal_mutex_t);

    lifeline = NULL;

    /* setup the list of children */
    OBJ_CONSTRUCT(&my_children, opal_list_t);
    num_children = 0;
    my_parent.jobid = ORTE_PROC_MY_NAME->jobid;

    return ORTE_SUCCESS;
}
static int mca_btl_tcp_component_open(void)
{
#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls = 0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* if_include and if_exclude need to be mutually exclusive */
    if (OPAL_SUCCESS != mca_base_param_check_exclusive_string(
            mca_btl_tcp_component.super.btl_version.mca_type_name,
            mca_btl_tcp_component.super.btl_version.mca_component_name,
            "if_include",
            mca_btl_tcp_component.super.btl_version.mca_type_name,
            mca_btl_tcp_component.super.btl_version.mca_component_name,
            "if_exclude")) {
        /* Return ERR_NOT_AVAILABLE so that a warning message about
         * "open" failing is not printed */
        return OMPI_ERR_NOT_AVAILABLE;
    }

    return OMPI_SUCCESS;
}
int orte_grpcomm_base_purge_proc_attrs(void)
{
    /*
     * Purge the attributes
     */
    opal_hash_table_remove_all(modex_data);
    OBJ_RELEASE(modex_data);
    modex_data = OBJ_NEW(opal_hash_table_t);
    opal_hash_table_init(modex_data, 256);

    /*
     * Clear the modex buffer
     */
    OBJ_RELEASE(modex_buffer);
    num_entries = 0;
    modex_buffer = OBJ_NEW(opal_buffer_t);

    return ORTE_SUCCESS;
}
static int mca_btl_tcp_component_open(void)
{
    if (OMPI_SUCCESS != mca_btl_tcp_component_verify()) {
        return OMPI_ERROR;
    }

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls = 0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* if_include and if_exclude need to be mutually exclusive */
    if (OPAL_SUCCESS != mca_base_var_check_exclusive("ompi",
            mca_btl_tcp_component.super.btl_version.mca_type_name,
            mca_btl_tcp_component.super.btl_version.mca_component_name,
            "if_include",
            mca_btl_tcp_component.super.btl_version.mca_type_name,
            mca_btl_tcp_component.super.btl_version.mca_component_name,
            "if_exclude")) {
        /* Return ERR_NOT_AVAILABLE so that a warning message about
         * "open" failing is not printed */
        return OMPI_ERR_NOT_AVAILABLE;
    }

    return OMPI_SUCCESS;
}
int mca_pml_monitoring_add_procs(struct ompi_proc_t **procs,
                                 size_t nprocs)
{
    /**
     * Create the monitoring hashtable only for my MPI_COMM_WORLD. We choose,
     * for now, to ignore all other processes.
     */
    if(NULL == translation_ht) {
        size_t i;
        uint64_t key;
        opal_process_name_t tmp;

        nbprocs = nprocs;

        translation_ht = OBJ_NEW(opal_hash_table_t);
        opal_hash_table_init(translation_ht, 2048);

        for( i = 0; i < nprocs; i++ ) {
            /* rank: position of ompi_proc_local_proc in procs */
            if( procs[i] == ompi_proc_local_proc ) {
                my_rank = i;
            }
            /* Extract the peer procname from the procs array */
            if( ompi_proc_is_sentinel(procs[i]) ) {
                tmp = ompi_proc_sentinel_to_name((uintptr_t)procs[i]);
            } else {
                tmp = procs[i]->super.proc_name;
            }
            key = *((uint64_t*)&tmp);

            /* store the rank (in MPI_COMM_WORLD) of the process, with its
             * name (a unique opal ID) as the key, in the hash table */
            if( OPAL_SUCCESS != opal_hash_table_set_value_uint64(translation_ht, key,
                                                                 (void*)(uintptr_t)i) ) {
                /* failed to allocate memory or grow the hash table */
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
    }
    return pml_selected_module.pml_add_procs(procs, nprocs);
}
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *contact_path, *jobfam_dir;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    char **aliases, *aptr;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /** setup callbacks for abort signals - from this point
     * forward, we need to abort in a manner that allows us
     * to cleanup. However, we cannot directly use libevent
     * to trap these signals as otherwise we cannot respond
     * to them if we are stuck in an event! So instead use
     * the basic POSIX trap functions to handle the signal,
     * and then let that signal handler do some magic to
     * avoid the hang
     *
     * NOTE: posix traps don't allow us to do anything major
     * in them, so use a pipe tied to a libevent event to
     * reach a "safe" place where the termination event can
     * be created
     */
    pipe(term_pipe);
    /* setup an event to attempt normal termination on signal */
    opal_event_set(orte_event_base, &term_handler, term_pipe[0],
                   OPAL_EV_READ, clean_abort, NULL);
    opal_event_set_priority(&term_handler, ORTE_ERROR_PRI);
    opal_event_add(&term_handler, NULL);

    /* Set both ends of this pipe to be close-on-exec so that no
       children inherit it */
    if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS ||
        opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) {
        error = "unable to set the pipe to CLOEXEC";
        goto error;
    }

    /* point the signal trap to a function that will activate that event */
    signal(SIGTERM, abort_signal_callback);
    signal(SIGINT, abort_signal_callback);
    signal(SIGHUP, abort_signal_callback);

    /** setup callbacks for signals we should forward */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback);
    setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback);
    setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif

    /* if we are using xml for output, put an mpirun start tag */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "<mpirun>\n");
        fflush(orte_xml_fp);
    }

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Since we are the HNP, responsibility for
     * defining the name falls to the PLM component for our
     * respective environment - hence, we have to open the PLM
     * first and select that component.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_select";
        goto error;
    }

    /* if we were spawned by a singleton, our jobid was given to us */
    if (NULL != orte_ess_base_jobid) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid,
                                                                     orte_ess_base_jobid))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_string_to_jobid";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = 0;
    } else {
        if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_set_hnp_name";
            goto error;
        }
    }

    /* Setup the communication infrastructure */

    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }

    /*
     * Runtime Messaging Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* init the nidmap - just so we register that verbosity */
    orte_util_nidmap_init(NULL);

    /* Setup the job data object for the daemons */
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* mark that the daemons have reported as we are the
     * only ones in the system right now, and we definitely
     * are running!
     */
    jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;

    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;

    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
#if OPAL_HAVE_HWLOC
    /* add it to the array of known topologies */
    opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    OBJ_RETAIN(node);   /* keep accounting straight */
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);

    /* record that the daemon (i.e., us) is on this node
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
    node->state = ORTE_NODE_STATE_UP;

    /* if we are to retain aliases, get ours */
    if (orte_retain_aliases) {
        aliases = NULL;
        opal_ifgetaliases(&aliases);
        /* add our own local name to it */
        opal_argv_append_nosize(&aliases, orte_process_info.nodename);
        aptr = opal_argv_join(aliases, ',');
        opal_argv_free(aliases);
        orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS,
                           ORTE_ATTR_LOCAL, aptr, OPAL_STRING);
        free(aptr);
    }

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /*
     * Routed system
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by the user */
    if (NULL == getenv("OMPI_MCA_dstore")) {
        putenv("OMPI_MCA_dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handles */
    if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) {
        error = "opal dstore global";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) {
        error = "opal dstore nonpeer";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions.
     * This needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }

    /*
     * Setup the remaining resource
     * management and errmgr frameworks - application procs
     * and daemons do not open these frameworks as they only use
     * the hnp proxy support in the PLM framework.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ras_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_find_available";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_find_available";
        goto error;
    }

#if OPAL_HAVE_HWLOC
    {
        char *coprocessors, **sns;
        uint32_t h;
        int idx;

        /* if a topology file was given, then the rmaps framework open
         * will have reset our topology. Ensure we always get the right
         * one by setting our node topology afterwards */
        node->topology = opal_hwloc_topology;

        /* init the hash table, if necessary */
        if (NULL == orte_coprocessors) {
            orte_coprocessors = OBJ_NEW(opal_hash_table_t);
            opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
        }
        /* detect and add any coprocessors */
        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
        if (NULL != coprocessors) {
            /* separate the serial numbers of the coprocessors
             * on this host */
            sns = opal_argv_split(coprocessors, ',');
            for (idx=0; NULL != sns[idx]; idx++) {
                /* compute the hash */
                OPAL_HASH_STR(sns[idx], h);
                /* mark that this coprocessor is hosted by this node */
                opal_hash_table_set_value_uint32(orte_coprocessors, h,
                                                 (void*)&(ORTE_PROC_MY_NAME->vpid));
            }
            opal_argv_free(sns);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
        /* see if I am on a coprocessor */
        coprocessors = opal_hwloc_base_check_on_coprocessor();
        if (NULL != coprocessors) {
            orte_set_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER,
                               ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
    }
#endif

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    /* Open/select the rtc */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* we are an hnp, so update the contact info field for later use */
    orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
    proc->rml_uri = strdup(orte_process_info.my_hnp_uri);

    /* we are also officially a daemon, so better update that field too */
    orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);

    /* setup the orte_show_help system to recv remote output */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
                            ORTE_RML_PERSISTENT,
                            orte_show_help_recv, NULL);

    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));

        /* take a pass thru the session directory code to fill in the
         * tmpdir names - don't create anything yet */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories lying around */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }

        /* Once the session directory location has been established, set
           the opal_output hnp file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);

        /* save my contact info in a file for others to find */
        jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
        contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
        free(jobfam_dir);

        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s writing contact file %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             contact_path));

        if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s writing contact file failed with error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_ERROR_NAME(ret)));
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s wrote contact file",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        }
        free(contact_path);
    }

    /* setup the routed info - the selected routed component
     * will know what to do.
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }

    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }

    /* For HNP, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif

    /*
     * Initialize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't, some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the dfs framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* if a tool has launched us and is requesting event reports,
     * then set its contact info into the comm system */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) {
            error = "could not connect to tool";
            goto error;
        }
    }

    /* We actually do *not* want an HNP to voluntarily yield() the
       processor more than necessary. Orterun already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at orterun, we want the
       OS to wake us up in a timely fashion (which most OS's seem
       good about doing) and then we want orterun to process the
       message as fast as possible. If orterun yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules orterun to run again
       (particularly if there is no IO event to wake it up). Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.).
    */
    opal_progress_set_yield_when_idle(false);

    return ORTE_SUCCESS;

error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ORTE_ERR_SILENT;
}
int mca_btl_tcp_component_open(void)
{
    char* message;

#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls = 0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* register TCP component parameters */
    mca_btl_tcp_component.tcp_num_links =
        mca_btl_tcp_param_register_int("links", NULL, 1);
    mca_btl_tcp_component.tcp_if_include =
        mca_btl_tcp_param_register_string("if_include", NULL, "");
    mca_btl_tcp_component.tcp_if_exclude =
        mca_btl_tcp_param_register_string("if_exclude", NULL, "lo");
    mca_btl_tcp_component.tcp_free_list_num =
        mca_btl_tcp_param_register_int ("free_list_num", NULL, 8);
    mca_btl_tcp_component.tcp_free_list_max =
        mca_btl_tcp_param_register_int ("free_list_max", NULL, -1);
    mca_btl_tcp_component.tcp_free_list_inc =
        mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32);
    mca_btl_tcp_component.tcp_sndbuf =
        mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_rcvbuf =
        mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_endpoint_cache =
        mca_btl_tcp_param_register_int ("endpoint_cache",
            "The size of the internal cache for each TCP connection. This cache is"
            " used to reduce the number of syscalls, by replacing them with memcpy."
            " Every read will read the expected data plus the amount of the"
            " endpoint_cache", 30*1024);
    mca_btl_tcp_component.tcp_use_nodelay =
        !mca_btl_tcp_param_register_int ("use_nagle",
            "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)",
            0);
    mca_btl_tcp_component.tcp_port_min =
        mca_btl_tcp_param_register_int( "port_min_v4",
            "The minimum port where the TCP BTL will try to bind (default 1024)",
            1024 );
    if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) {
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v4", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp_port_min );
        mca_btl_tcp_component.tcp_port_min = 1024;
    }
    asprintf( &message,
              "The number of ports where the TCP BTL will try to bind (default %d)."
" This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ); mca_btl_tcp_component.tcp_port_range = mca_btl_tcp_param_register_int( "port_range_v4", message, (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1); free(message); #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_port_min = mca_btl_tcp_param_register_int( "port_min_v6", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v6", orte_process_info.nodename, mca_btl_tcp_component.tcp6_port_min ); mca_btl_tcp_component.tcp6_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ); mca_btl_tcp_component.tcp6_port_range = mca_btl_tcp_param_register_int( "port_range_v6", message, (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1); free(message); #endif mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100; mca_btl_tcp_module.super.btl_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_max_send_size = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version, &mca_btl_tcp_module.super); mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); return OMPI_SUCCESS; }
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *log_path = NULL;
    char *jobidstring;
    char *error = NULL;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;
    char *param;
    hwloc_obj_t obj;
    unsigned i, j;
    opal_list_t transports;

    /* my name is set, xfer it to the OPAL layer */
    orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
    orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
    orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
    orte_process_info.super.proc_arch = opal_local_arch;
    opal_proc_local_set(&orte_process_info.super);

    plm_in_use = false;

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

    /* get the local topology */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
    }
    /* generate the signature */
    orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);

    /* remove the hostname from the topology. Unfortunately, hwloc
     * decided to add the source hostname to the "topology", thus
     * rendering it unusable as a pure topological description. So
     * we remove that information here. */
    obj = hwloc_get_root_obj(opal_hwloc_topology);
    for (i=0; i < obj->infos_count; i++) {
        if (NULL == obj->infos[i].name ||
            NULL == obj->infos[i].value) {
            continue;
        }
        if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
            free(obj->infos[i].name);
            free(obj->infos[i].value);
            /* left justify the array */
            for (j=i; j < obj->infos_count-1; j++) {
                obj->infos[j] = obj->infos[j+1];
            }
            obj->infos[obj->infos_count-1].name = NULL;
            obj->infos[obj->infos_count-1].value = NULL;
            obj->infos_count--;
            break;
        }
    }

    if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
        opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
    }

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* define the HNP name */
    ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_HNP->vpid = 0;

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use */
    (void) mca_base_var_env_name("plm",
                                 &param);
    plm_in_use = !!(getenv(param));
    free (param);
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }

    /* setup my session directory here as the OOB may need it */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        /* take a pass thru the session directory code to fill in the
         * tmpdir names - don't create anything yet */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories lying around */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* set the opal_output env file location to be in the
         * proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file */
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false, orte_process_info.top_session_dir,
                                    log_file, NULL);

            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_hash_table_t);
    if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* Setup the job data object for the daemons */
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;

    /* create and store a node
     * object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);

    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    proc->pid = orte_process_info.pid;
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);

    /* record that the daemon (i.e., us) is on this node
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
    node->state = ORTE_NODE_STATE_UP;
    /* now point our proc node field to the node */
    OBJ_RETAIN(node);   /* keep accounting straight */
    proc->node = node;

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /* setup the PMIx framework - ensure it skips all non-PMIx components,
     * but do not override anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_pmix_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pmix_base_select";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* setup the PMIx server */
    if (ORTE_SUCCESS != (ret = pmix_server_init())) {
        /* the server code already barked, so let's be quiet */
        ret = ORTE_ERR_SILENT;
        error = "pmix_server_init";
        goto error;
    }

    /* Setup the communication infrastructure */
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (NULL != orte_process_info.my_hnp_uri) {
        /* extract the HNP's name so we can update the routing table */
        if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                            ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_rml_parse_HNP";
            goto error;
        }
        /* Set the contact info in the RML - this won't actually establish
         * the connection, but just tells the RML how to reach the HNP
         * if/when we attempt to send to it */
        orte_rml.set_contact_info(orte_process_info.my_hnp_uri);
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* get a conduit for our
     * use - we never route IO over fabric */
    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
    orte_mgmt_conduit = orte_rml.open_conduit(&transports);
    OPAL_LIST_DESTRUCT(&transports);

    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
    orte_coll_conduit = orte_rml.open_conduit(&transports);
    OPAL_LIST_DESTRUCT(&transports);

    /* add our contact info to our proc object */
    proc->rml_uri = orte_rml.get_contact_info();

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    /* Open/select the rtc */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_select";
        goto error;
    }

    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things */
    orte_routed.update_routing_plan(NULL);

    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap */
    if (orte_static_ports) {
        /* extract the node info from the environment and
         * build a nidmap from it - this will update the
         * routing plan as well */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions.
     * This needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv.
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }

    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }
    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif

    /*
     * Initialize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't, some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    return ORTE_SUCCESS;

error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    /* remove our use of the session directory tree */
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
    /* ensure we scrub the session directory tree */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
    return ORTE_ERR_SILENT;
}
static void infosubscriber_construct(opal_infosubscriber_t *obj)
{
    OBJ_CONSTRUCT(&obj->s_subscriber_table, opal_hash_table_t);
    opal_hash_table_init(&obj->s_subscriber_table, 10);
}
static void jobdata_constructor(job_data_t *ptr)
{
    ptr->jobid = ORTE_JOBID_INVALID;
    ptr->data = OBJ_NEW(opal_hash_table_t);
    opal_hash_table_init(ptr->data, 256);
}
int mca_btl_tcp_component_open(void)
{
    char* message;

#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls = 0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* register TCP component parameters */
    mca_btl_tcp_component.tcp_num_links =
        mca_btl_tcp_param_register_int("links", NULL, 1);
    mca_btl_tcp_component.tcp_if_include =
        mca_btl_tcp_param_register_string("if_include",
            "Comma-delimited list of devices or CIDR notation of networks to use for MPI communication (e.g., \"eth0,eth1\" or \"192.168.0.0/16,10.1.4.0/24\"). Mutually exclusive with btl_tcp_if_exclude.",
            "");
    mca_btl_tcp_component.tcp_if_exclude =
        mca_btl_tcp_param_register_string("if_exclude",
            "Comma-delimited list of devices or CIDR notation of networks to NOT use for MPI communication -- all devices not matching these specifications will be used (e.g., \"eth0,eth1\" or \"192.168.0.0/16,10.1.4.0/24\"). Mutually exclusive with btl_tcp_if_include.",
            "lo,sppp");
    mca_btl_tcp_component.tcp_free_list_num =
        mca_btl_tcp_param_register_int ("free_list_num", NULL, 8);
    mca_btl_tcp_component.tcp_free_list_max =
        mca_btl_tcp_param_register_int ("free_list_max", NULL, -1);
    mca_btl_tcp_component.tcp_free_list_inc =
        mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32);
    mca_btl_tcp_component.tcp_sndbuf =
        mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_rcvbuf =
        mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_endpoint_cache =
        mca_btl_tcp_param_register_int ("endpoint_cache",
            "The size of the internal cache for each TCP connection. This cache is"
            " used to reduce the number of syscalls, by replacing them with memcpy."
            " Every read will read the expected data plus the amount of the"
            " endpoint_cache", 30*1024);
    mca_btl_tcp_component.tcp_use_nodelay =
        !mca_btl_tcp_param_register_int ("use_nagle",
            "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)",
            0);
    mca_btl_tcp_component.tcp_port_min =
        mca_btl_tcp_param_register_int( "port_min_v4",
            "The minimum port where the TCP BTL will try to bind (default 1024)",
            1024 );
    if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) {
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v4", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp_port_min );
        mca_btl_tcp_component.tcp_port_min = 1024;
    }
    asprintf( &message,
              "The number of ports where the TCP BTL will try to bind (default %d)."
" This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ); mca_btl_tcp_component.tcp_port_range = mca_btl_tcp_param_register_int( "port_range_v4", message, (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1); free(message); #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_port_min = mca_btl_tcp_param_register_int( "port_min_v6", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v6", orte_process_info.nodename, mca_btl_tcp_component.tcp6_port_min ); mca_btl_tcp_component.tcp6_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ); mca_btl_tcp_component.tcp6_port_range = mca_btl_tcp_param_register_int( "port_range_v6", message, (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1); free(message); #endif mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100; mca_btl_tcp_module.super.btl_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_max_send_size = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version, &mca_btl_tcp_module.super); mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); /* Register a list of interfaces to use in sequence */ message = mca_btl_tcp_param_register_string("if_seq", "If specified, a comma-delimited list of TCP interfaces. Interfaces will be assigned, one to each MPI process, in a round-robin fashion on each server. 
For example, if the list is \"eth0,eth1\" and four MPI processes are run on a single server, then local ranks 0 and 2 will use eth0 and local ranks 1 and 3 will use eth1.", NULL); mca_btl_tcp_component.tcp_if_seq = NULL; if (NULL != message && '\0' != *message) { char **argv = opal_argv_split(message, ','); if (NULL != argv && '\0' != *(argv[0])) { int if_index, rc, count; orte_node_rank_t node_rank; char name[256]; node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); /* Now that we've got that local rank, take the corresponding entry from the tcp_if_seq list (wrapping if necessary) */ count = opal_argv_count(argv); mca_btl_tcp_component.tcp_if_seq = strdup(argv[node_rank % count]); opal_argv_free(argv); /* Double check that the selected interface actually exists */ for (if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){ if (OPAL_SUCCESS != (rc = opal_ifindextoname(if_index, name, sizeof(name)))) { return rc; } if (0 == strcmp(name, mca_btl_tcp_component.tcp_if_seq)) { break; } } if (if_index < 0) { orte_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude", true, "if_seq", orte_process_info.nodename, mca_btl_tcp_component.tcp_if_seq, "Interface does not exist"); return OMPI_ERR_BAD_PARAM; } BTL_VERBOSE(("Node rank %d using TCP interface %s", node_rank, mca_btl_tcp_component.tcp_if_seq)); } } return OMPI_SUCCESS; }
int ompi_mpi_init(int argc, char **argv, int requested, int *provided) { int ret; ompi_proc_t** procs; size_t nprocs; char *error = NULL; struct timeval ompistart, ompistop; ompi_rte_collective_t *coll; char *cmd=NULL, *av=NULL; /* bitflag of the thread level support provided. To be used * for the modex in order to work in heterogeneous environments. */ uint8_t threadlevel_bf; /* Indicate that we have *started* MPI_INIT*. MPI_FINALIZE has something sorta similar in a static local variable in ompi_mpi_finalize(). */ ompi_mpi_init_started = true; /* Setup enough to check get/set MCA params */ if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) { error = "ompi_mpi_init: opal_init_util failed"; goto error; } /* Register MCA variables */ if (OPAL_SUCCESS != (ret = ompi_register_mca_variables())) { error = "ompi_mpi_init: ompi_register_mca_variables failed"; goto error; } if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) { error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed"; goto error; } /* _After_ opal_init_util() but _before_ orte_init(), we need to set an MCA param that tells libevent that it's ok to use any mechanism in libevent that is available on this platform (e.g., epoll and friends). Per opal/event/event.s, we default to select/poll -- but we know that MPI processes won't be using pty's with the event engine, so it's ok to relax this constraint and let any fd-monitoring mechanism be used. */ ret = mca_base_var_find("opal", "event", "*", "event_include"); if (ret >= 0) { char *allvalue = "all"; /* We have to explicitly "set" the MCA param value here because libevent initialization will re-register the MCA param and therefore override the default. Setting the value here puts the desired value ("all") in different storage that is not overwritten if/when the MCA param is re-registered. This is unless the user has specified a different value for this MCA parameter. Make sure we check to see if the default is specified before forcing "all" in case that is not what the user desires. Note that we do *NOT* set this value as an environment variable, just so that it won't be inherited by any spawned processes and potentially cause unintended side-effects with launching RTE tools...
*/ mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL); } if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistart, NULL); } /* if we were not externally started, then we need to setup * some envars so the MPI_INFO_ENV can get the cmd name * and argv (but only if the user supplied a non-NULL argv!), and * the requested thread level */ if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) { asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]); putenv(cmd); } if (NULL == getenv("OMPI_ARGV") && 1 < argc) { char *tmp; tmp = opal_argv_join(&argv[1], ' '); asprintf(&av, "OMPI_ARGV=%s", tmp); free(tmp); putenv(av); } /* open the rte framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rte_base_framework, 0))) { error = "ompi_rte_base_open() failed"; goto error; } /* no select is required as this is a static framework */ /* Setup RTE - note that we are an MPI process */ if (OMPI_SUCCESS != (ret = ompi_rte_init(NULL, NULL))) { error = "ompi_mpi_init: ompi_rte_init failed"; goto error; } /* check for timing request - get stop time and report elapsed time if so */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of rte_init %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } #if OPAL_HAVE_HWLOC /* if hwloc is available but didn't get setup for some * reason, do so now */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "Topology init"; goto error; } } #endif /* Register the default errhandler callback - RTE will ignore if it * doesn't support this capability */ ompi_rte_register_errhandler(ompi_errhandler_runtime_callback, OMPI_RTE_ERRHANDLER_LAST); /* Figure out the final MPI thread levels. If we were not compiled for support for MPI threads, then don't allow MPI_THREAD_MULTIPLE. Set this stuff up here early in the process so that other components can make decisions based on this value. */ ompi_mpi_thread_level(requested, provided); /* determine the bitflag belonging to the threadlevel_support provided */ memset ( &threadlevel_bf, 0, sizeof(uint8_t)); OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf ); /* add this bitflag to the modex */ if ( OMPI_SUCCESS != (ret = ompi_modex_send_string("MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)))) { error = "ompi_mpi_init: modex send thread level"; goto error; } /* initialize datatypes. This step should be done early as it will * create the local convertor and local arch used in the proc * init. */ if (OMPI_SUCCESS != (ret = ompi_datatype_init())) { error = "ompi_datatype_init() failed"; goto error; } /* Initialize OMPI procs */ if (OMPI_SUCCESS != (ret = ompi_proc_init())) { error = "mca_proc_init() failed"; goto error; } /* Initialize the op framework. This has to be done *after* ddt_init, but before mca_coll_base_open, since some collective modules (e.g., the hierarchical coll component) may need ops in their query function.
*/ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_op_base_framework, 0))) { error = "ompi_op_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_op_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_THREAD_MULTIPLE))) { error = "ompi_op_base_find_available() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_op_init())) { error = "ompi_op_init() failed"; goto error; } /* Open up MPI-related MCA components */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_allocator_base_framework, 0))) { error = "mca_allocator_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_rcache_base_framework, 0))) { error = "mca_rcache_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_mpool_base_framework, 0))) { error = "mca_mpool_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_bml_base_framework, 0))) { error = "mca_bml_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pml_base_framework, 0))) { error = "mca_pml_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_coll_base_framework, 0))) { error = "mca_coll_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_osc_base_framework, 0))) { error = "ompi_osc_base_open() failed"; goto error; } #if OPAL_ENABLE_FT_CR == 1 if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_crcp_base_framework, 0))) { error = "ompi_crcp_base_open() failed"; goto error; } #endif /* In order to reduce the common case for MPI apps (where they don't use MPI-2 IO or MPI-1 topology functions), the io and topo frameworks are initialized lazily, at the first use of relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*), so they are not opened here. */ /* Select which MPI components to use */ if (OMPI_SUCCESS != (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_THREAD_MULTIPLE))) { error = "mca_mpool_base_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_THREAD_MULTIPLE))) { error = "mca_pml_base_select() failed"; goto error; } /* check for timing request - get stop time and report elapsed time if so */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from completion of rte_init to modex %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* exchange connection info - this function also acts as a barrier * as it will not return until the exchange is complete */ coll = OBJ_NEW(ompi_rte_collective_t); coll->id = ompi_process_info.peer_modex; coll->active = true; if (OMPI_SUCCESS != (ret = ompi_rte_modex(coll))) { error = "rte_modex failed"; goto error; } /* wait for modex to complete - this may be moved anywhere in mpi_init * so long as it occurs prior to calling a function that needs * the modex info! 
*/ OMPI_WAIT_FOR_COMPLETION(coll->active); OBJ_RELEASE(coll); if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* select buffered send allocator component to be used */ if( OMPI_SUCCESS != (ret = mca_pml_base_bsend_init(OMPI_ENABLE_THREAD_MULTIPLE))) { error = "mca_pml_base_bsend_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_THREAD_MULTIPLE))) { error = "mca_coll_base_find_available() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_THREAD_MULTIPLE))) { error = "ompi_osc_base_find_available() failed"; goto error; } #if OPAL_ENABLE_FT_CR == 1 if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) { error = "ompi_crcp_base_select() failed"; goto error; } #endif /* io and topo components are not selected here -- see comment above about the io and topo frameworks being loaded lazily */ /* Initialize each MPI handle subsystem */ /* initialize requests */ if (OMPI_SUCCESS != (ret = ompi_request_init())) { error = "ompi_request_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_message_init())) { error = "ompi_message_init() failed"; goto error; } /* initialize info */ if (OMPI_SUCCESS != (ret = ompi_info_init())) { error = "ompi_info_init() failed"; goto error; } /* initialize error handlers */ if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) { error = "ompi_errhandler_init() failed"; goto error; } /* initialize error codes */ if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) { error = "ompi_mpi_errcode_init() failed"; goto error; } /* initialize internal error codes */ if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) { error = "ompi_errcode_intern_init() failed"; goto error; } /* initialize groups */ if (OMPI_SUCCESS != (ret = ompi_group_init())) { error = "ompi_group_init() failed"; goto error; } /* initialize communicators */ if (OMPI_SUCCESS != (ret = ompi_comm_init())) { error = "ompi_comm_init() failed"; goto error; } /* initialize file handles */ if (OMPI_SUCCESS != (ret = ompi_file_init())) { error = "ompi_file_init() failed"; goto error; } /* initialize windows */ if (OMPI_SUCCESS != (ret = ompi_win_init())) { error = "ompi_win_init() failed"; goto error; } /* initialize attribute meta-data structure for comm/win/dtype */ if (OMPI_SUCCESS != (ret = ompi_attr_init())) { error = "ompi_attr_init() failed"; goto error; } /* identify the architectures of remote procs and setup * their datatype convertors, if required */ if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) { error = "ompi_proc_complete_init failed"; goto error; } /* If thread support was enabled, then setup OPAL to allow for them. */ if ((OMPI_ENABLE_PROGRESS_THREADS == 1) || (*provided != MPI_THREAD_SINGLE)) { opal_set_using_threads(true); } /* start PML/BTL's */ ret = MCA_PML_CALL(enable(true)); if( OMPI_SUCCESS != ret ) { error = "PML control failed"; goto error; } /* add all ompi_proc_t's to PML */ if (NULL == (procs = ompi_proc_world(&nprocs))) { error = "ompi_proc_world() failed"; goto error; } ret = MCA_PML_CALL(add_procs(procs, nprocs)); free(procs); /* If we got "unreachable", then print a specific error message. 
Otherwise, if we got some other failure, fall through to print a generic message. */ if (OMPI_ERR_UNREACH == ret) { opal_show_help("help-mpi-runtime", "mpi_init:startup:pml-add-procs-fail", true); error = NULL; goto error; } else if (OMPI_SUCCESS != ret) { error = "PML add procs failed"; goto error; } MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm)); /* * Dump all MCA parameters if requested */ if (ompi_mpi_show_mca_params) { ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, nprocs, ompi_process_info.nodename); } /* Do we need to wait for a debugger? */ ompi_rte_wait_for_debugger(); /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* wait for everyone to reach this point */ coll = OBJ_NEW(ompi_rte_collective_t); coll->id = ompi_process_info.peer_init_barrier; coll->active = true; if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) { error = "rte_barrier failed"; goto error; } /* wait for barrier to complete */ OMPI_WAIT_FOR_COMPLETION(coll->active); OBJ_RELEASE(coll); /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } #if OMPI_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. Don't block in the event library, so that communications don't take forever between procs in the dynamic code. This will increase CPU utilization for the remainder of MPI_INIT when we are blocking on RTE-level events, but may greatly reduce non-TCP latency. */ opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); #endif /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. Do before the polling change as anyone with a complex wire-up is going to be using the oob. */ if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) { error = "ompi_mpi_do_preconnect_all() failed"; goto error; } /* Setup the publish/subscribe (PUBSUB) framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_pubsub_base_framework, 0))) { error = "mca_pubsub_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) { error = "ompi_pubsub_base_select() failed"; goto error; } /* Setup the dynamic process management (DPM) framework */ if (OMPI_SUCCESS != (ret = mca_base_framework_open(&ompi_dpm_base_framework, 0))) { error = "ompi_dpm_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) { error = "ompi_dpm_base_select() failed"; goto error; } /* Determine the overall threadlevel support of all processes in MPI_COMM_WORLD. This has to be done before calling coll_base_comm_select, since some of the collective components e.g. hierarch, might create subcommunicators. The threadlevel requested by all processes is required in order to know which cid allocation algorithm can be used. 
*/ if ( OMPI_SUCCESS != ( ret = ompi_comm_cid_init ())) { error = "ompi_mpi_init: ompi_comm_cid_init failed"; goto error; } /* Init coll for the comms. This has to be after dpm_base_select, (since dpm.mark_dyncomm is not set in the communicator creation function else), but before dpm.dyncom_init, since this function might require collective for the CID allocation. */ if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) { error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) { error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed"; goto error; } /* Check whether we have been spawned or not. We introduce that at the very end, since we need collectives, datatypes, ptls etc. up and running here.... */ if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) { error = "ompi_comm_dyn_init() failed"; goto error; } /* * Startup the Checkpoint/Restart Mech. * Note: Always do this so tools don't hang when * in a non-checkpointable build */ if (OMPI_SUCCESS != (ret = ompi_cr_init())) { error = "ompi_cr_init"; goto error; } /* Undo OPAL calling opal_progress_event_users_increment() during opal_init, to get better latency when not using TCP. Do this *after* dyn_init, as dyn init uses lots of RTE communication and we don't want to hinder the performance of that code. */ opal_progress_event_users_decrement(); /* see if yield_when_idle was specified - if so, use it */ opal_progress_set_yield_when_idle(ompi_mpi_yield_when_idle); /* negative value means use default - just don't do anything */ if (ompi_mpi_event_tick_rate >= 0) { opal_progress_set_event_poll_rate(ompi_mpi_event_tick_rate); } /* At this point, we are fully configured and in MPI mode. Any communication calls here will work exactly like they would in the user's code. Setup the connections between procs and warm them up with simple sends, if requested */ if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) { error = "ompi_mpiext_init"; goto error; } /* Fall through */ error: if (ret != OMPI_SUCCESS) { /* Only print a message if one was not already printed */ if (NULL != error) { const char *err_msg = opal_strerror(ret); opal_show_help("help-mpi-runtime", "mpi_init:startup:internal-failure", true, "MPI_INIT", "MPI_INIT", error, err_msg, ret); } return ret; } /* Initialize the registered datarep list to be empty */ OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t); /* Initialize the arrays used to store the F90 types returned by the * MPI_Type_create_f90_XXX functions. */ OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */); OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP); OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP); /* All done. Wasn't that simple?
*/ ompi_mpi_initialized = true; /* check for timing request - get stop time and report elapsed time if so */ if (ompi_enable_timing && 0 == OMPI_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec", (long)OMPI_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); } return MPI_SUCCESS; }
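/* All of the timing blocks above repeat one pattern: sample gettimeofday()
 * before and after a phase and print sec*1000000 + usec as the delta. A
 * minimal standalone version of that arithmetic (the helper name is ours): */
#include <stdio.h>
#include <sys/time.h>

static long elapsed_usec(const struct timeval *start, const struct timeval *stop)
{
    return (stop->tv_sec - start->tv_sec) * 1000000L
         + (stop->tv_usec - start->tv_usec);
}

int main(void)
{
    struct timeval t0, t1;
    gettimeofday(&t0, NULL);
    /* ... phase being timed ... */
    gettimeofday(&t1, NULL);
    printf("phase took %ld usec\n", elapsed_usec(&t0, &t1));
    return 0;
}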
int ompi_mpi_init(int argc, char **argv, int requested, int *provided) { int ret; ompi_proc_t** procs; size_t nprocs; char *error = NULL; bool timing = false; int param, value; struct timeval ompistart, ompistop; char *event_val = NULL; opal_paffinity_base_cpu_set_t mask; bool proc_bound; #if 0 /* see comment below about sched_yield */ int num_processors; #endif bool orte_setup = false; bool paffinity_enabled = false; /* Setup enough to check get/set MCA params */ if (ORTE_SUCCESS != (ret = opal_init_util())) { error = "ompi_mpi_init: opal_init_util failed"; goto error; } /* _After_ opal_init_util() but _before_ orte_init(), we need to set an MCA param that tells libevent that it's ok to use any mechanism in libevent that is available on this platform (e.g., epoll and friends). Per opal/event/event.s, we default to select/poll -- but we know that MPI processes won't be using pty's with the event engine, so it's ok to relax this constraint and let any fd-monitoring mechanism be used. */ ret = mca_base_param_reg_string_name("opal", "event_include", "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent", false, false, "all", &event_val); if (ret >= 0) { /* We have to explicitly "set" the MCA param value here because libevent initialization will re-register the MCA param and therefore override the default. Setting the value here puts the desired value ("all") in different storage that is not overwritten if/when the MCA param is re-registered. This is unless the user has specified a different value for this MCA parameter. Make sure we check to see if the default is specified before forcing "all" in case that is not what the user desires. Note that we do *NOT* set this value as an environment variable, just so that it won't be inherited by any spawned processes and potentially cause unintended side-effects with launching ORTE tools... */ if (0 == strcmp("all", event_val)) { mca_base_param_set_string(ret, "all"); } } if( NULL != event_val ) { free(event_val); event_val = NULL; } /* check to see if we want timing information */ param = mca_base_param_reg_int_name("ompi", "timing", "Request that critical timing loops be measured", false, false, 0, &value); if (value != 0) { timing = true; gettimeofday(&ompistart, NULL); } /* Setup ORTE - note that we are not a tool */ if (ORTE_SUCCESS != (ret = orte_init(ORTE_NON_TOOL))) { error = "ompi_mpi_init: orte_init failed"; goto error; } orte_setup = true; /* check for timing request - get stop time and report elapsed time if so */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* Figure out the final MPI thread levels. If we were not compiled for support for MPI threads, then don't allow MPI_THREAD_MULTIPLE. Set this stuff up here early in the process so that other components can make decisions based on this value.
*/ ompi_mpi_thread_requested = requested; if (OMPI_HAVE_THREAD_SUPPORT == 0) { ompi_mpi_thread_provided = *provided = MPI_THREAD_SINGLE; ompi_mpi_main_thread = NULL; } else if (OMPI_ENABLE_MPI_THREADS == 1) { ompi_mpi_thread_provided = *provided = requested; ompi_mpi_main_thread = opal_thread_get_self(); } else { if (MPI_THREAD_MULTIPLE == requested) { ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED; } else { ompi_mpi_thread_provided = *provided = requested; } ompi_mpi_main_thread = opal_thread_get_self(); } ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == MPI_THREAD_MULTIPLE); /* Once we've joined the RTE, see if any MCA parameters were passed to the MPI level */ if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) { error = "mca_mpi_register_params() failed"; goto error; } /* if it hasn't already been done, setup process affinity. * First check to see if a slot list was * specified. If so, use it. If no slot list was specified, * that's not an error -- just fall through and try the next * paffinity scheme. */ ret = opal_paffinity_base_get(&mask); if (OPAL_SUCCESS == ret) { /* paffinity is supported - check for binding */ OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound); if (proc_bound) { /* someone external set it - indicate it is set * so that we know */ paffinity_enabled = true; } else { /* the system is capable of doing processor affinity, but it * has not yet been set - see if a slot_list was given */ if (NULL != opal_paffinity_base_slot_list) { /* It's an error if multiple paffinity schemes were specified */ if (opal_paffinity_alone) { ret = OMPI_ERR_BAD_PARAM; error = "Multiple processor affinity schemes specified (can only specify one)"; goto error; } ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list); if (OPAL_SUCCESS != ret && OPAL_ERR_NOT_FOUND != ret) { error = "opal_paffinity_base_slot_list_set() returned an error"; goto error; } paffinity_enabled = true; } else if (opal_paffinity_alone) { /* no slot_list, but they asked for paffinity */ int phys_cpu; orte_node_rank_t nrank; if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) { error = "Could not get node rank - cannot set processor affinity"; goto error; } OPAL_PAFFINITY_CPU_ZERO(mask); phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank); if (0 > phys_cpu) { error = "Could not get physical processor id - cannot set processor affinity"; goto error; } OPAL_PAFFINITY_CPU_SET(phys_cpu, mask); ret = opal_paffinity_base_set(mask); if (OPAL_SUCCESS != ret) { error = "Setting processor affinity failed"; goto error; } paffinity_enabled = true; } } } /* If we were able to set processor affinity, try setting up memory affinity */ if (!opal_maffinity_setup && paffinity_enabled) { if (OPAL_SUCCESS == opal_maffinity_base_open() && OPAL_SUCCESS == opal_maffinity_base_select()) { opal_maffinity_setup = true; } } /* initialize datatypes. This step should be done early as it will * create the local convertor and local arch used in the proc * init. */ if (OMPI_SUCCESS != (ret = ompi_ddt_init())) { error = "ompi_ddt_init() failed"; goto error; } /* Initialize OMPI procs */ if (OMPI_SUCCESS != (ret = ompi_proc_init())) { error = "mca_proc_init() failed"; goto error; } /* initialize ops. This has to be done *after* ddt_init, but before mca_coll_base_open, since some collective modules (e.g.
the hierarchical) need them in the query function */ if (OMPI_SUCCESS != (ret = ompi_op_init())) { error = "ompi_op_init() failed"; goto error; } /* Open up MPI-related MCA components */ if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) { error = "mca_allocator_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) { error = "mca_rcache_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) { error = "mca_mpool_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_pml_base_open())) { error = "mca_pml_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_open())) { error = "mca_coll_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) { error = "ompi_osc_base_open() failed"; goto error; } #if OPAL_ENABLE_FT == 1 if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) { error = "ompi_crcp_base_open() failed"; goto error; } #endif /* In order to reduce the common case for MPI apps (where they don't use MPI-2 IO or MPI-1 topology functions), the io and topo frameworks are initialized lazily, at the first use of relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*), so they are not opened here. */ /* Select which MPI components to use */ if (OMPI_SUCCESS != (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_MPI_THREADS))) { error = "mca_mpool_base_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_MPI_THREADS))) { error = "mca_pml_base_select() failed"; goto error; } /* select buffered send allocator component to be used */ ret=mca_pml_base_bsend_init(OMPI_ENABLE_MPI_THREADS); if( OMPI_SUCCESS != ret ) { error = "mca_pml_base_bsend_init() failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_MPI_THREADS))) { error = "mca_coll_base_find_available() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS, OMPI_ENABLE_MPI_THREADS))) { error = "ompi_osc_base_find_available() failed"; goto error; } #if OPAL_ENABLE_FT == 1 if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) { error = "ompi_crcp_base_select() failed"; goto error; } #endif /* io and topo components are not selected here -- see comment above about the io and topo frameworks being loaded lazily */ /* Initialize each MPI handle subsystem */ /* initialize requests */ if (OMPI_SUCCESS != (ret = ompi_request_init())) { error = "ompi_request_init() failed"; goto error; } /* initialize info */ if (OMPI_SUCCESS != (ret = ompi_info_init())) { error = "ompi_info_init() failed"; goto error; } /* initialize error handlers */ if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) { error = "ompi_errhandler_init() failed"; goto error; } /* initialize error codes */ if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) { error = "ompi_mpi_errcode_init() failed"; goto error; } /* initialize internal error codes */ if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) { error = "ompi_errcode_intern_init() failed"; goto error; } /* initialize groups */ if (OMPI_SUCCESS != (ret = ompi_group_init())) { error = "ompi_group_init() failed"; goto error; } /* initialize communicators */ if (OMPI_SUCCESS != (ret = ompi_comm_init())) { error = "ompi_comm_init() failed"; goto error; } /* initialize file handles */ if (OMPI_SUCCESS != (ret = ompi_file_init())) { error = "ompi_file_init() failed"; goto error; } /* initialize 
windows */ if (OMPI_SUCCESS != (ret = ompi_win_init())) { error = "ompi_win_init() failed"; goto error; } /* initialize attribute meta-data structure for comm/win/dtype */ if (OMPI_SUCCESS != (ret = ompi_attr_init())) { error = "ompi_attr_init() failed"; goto error; } /* check for timing request - get stop time and report elapsed time if so */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* exchange connection info - this function also acts as a barrier * as it will not return until the exchange is complete */ if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { error = "orte_grpcomm_modex failed"; goto error; } if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* identify the architectures of remote procs and setup * their datatype convertors, if required */ if (OMPI_SUCCESS != (ret = ompi_proc_set_arch())) { error = "ompi_proc_set_arch failed"; goto error; } /* If thread support was enabled, then setup OPAL to allow for them. */ if ((OMPI_ENABLE_PROGRESS_THREADS == 1) || (*provided != MPI_THREAD_SINGLE)) { opal_set_using_threads(true); } /* start PML/BTL's */ ret = MCA_PML_CALL(enable(true)); if( OMPI_SUCCESS != ret ) { error = "PML control failed"; goto error; } /* add all ompi_proc_t's to PML */ if (NULL == (procs = ompi_proc_world(&nprocs))) { error = "ompi_proc_world() failed"; goto error; } ret = MCA_PML_CALL(add_procs(procs, nprocs)); free(procs); if( OMPI_SUCCESS != ret ) { error = "PML add procs failed"; goto error; } MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm)); MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm)); /* * Dump all MCA parameters if requested */ if (ompi_mpi_show_mca_params) { ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, nprocs, orte_process_info.nodename); } /* Do we need to wait for a debugger? */ ompi_wait_for_debugger(); /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from modex thru complete oob wireup %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } /* wait for everyone to reach this point */ if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { error = "orte_grpcomm_barrier failed"; goto error; } /* check for timing request - get stop time and report elapsed time if so, then start the clock again */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); gettimeofday(&ompistart, NULL); } #if OMPI_ENABLE_PROGRESS_THREADS == 0 /* Start setting up the event engine for MPI operations. 
Don't block in the event library, so that communications don't take forever between procs in the dynamic code. This will increase CPU utilization for the remainder of MPI_INIT when we are blocking on ORTE-level events, but may greatly reduce non-TCP latency. */ opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK); #endif /* wire up the mpi interface, if requested. Do this after the non-block switch for non-TCP performance. Do before the polling change as anyone with a complex wire-up is going to be using the oob. */ if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) { error = "ompi_mpi_do_preconnect_all() failed"; goto error; } /* Setup the publish/subscribe (PUBSUB) framework */ if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) { error = "ompi_pubsub_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) { error = "ompi_pubsub_base_select() failed"; goto error; } /* Setup the dynamic process management (DPM) framework */ if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) { error = "ompi_dpm_base_open() failed"; goto error; } if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) { error = "ompi_dpm_base_select() failed"; goto error; } /* Init coll for the comms. This has to be after dpm_base_select, (since dpm.mark_dyncomm is not set in the communicator creation function else), but before dpm.dyncom_init, since this function might require collective for the CID allocation. */ if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) { error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed"; goto error; } if (OMPI_SUCCESS != (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) { error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed"; goto error; } /* Check whether we have been spawned or not. We introduce that at the very end, since we need collectives, datatypes, ptls etc. up and running here.... */ if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) { error = "ompi_comm_dyn_init() failed"; goto error; } /* * Startup the Checkpoint/Restart Mech. * Note: Always do this so tools don't hang when * in a non-checkpointable build */ if (OMPI_SUCCESS != (ret = ompi_cr_init())) { error = "ompi_cr_init"; goto error; } /* Undo OPAL calling opal_progress_event_users_increment() during opal_init, to get better latency when not using TCP. Do this *after* dyn_init, as dyn init uses lots of ORTE communication and we don't want to hinder the performance of that code. */ opal_progress_event_users_decrement(); /* see if yield_when_idle was specified - if so, use it */ param = mca_base_param_find("mpi", NULL, "yield_when_idle"); mca_base_param_lookup_int(param, &value); if (value < 0) { /* if no info is provided, just default to conservative */ opal_progress_set_yield_when_idle(true); } else { /* info was provided, so set idle accordingly */ opal_progress_set_yield_when_idle(value == 0 ? false : true); } param = mca_base_param_find("mpi", NULL, "event_tick_rate"); mca_base_param_lookup_int(param, &value); /* negative value means use default - just don't do anything */ if (value >= 0) { opal_progress_set_event_poll_rate(value); } /* At this point, we are fully configured and in MPI mode. Any communication calls here will work exactly like they would in the user's code. 
Setup the connections between procs and warm them up with simple sends, if requested */ error: if (ret != OMPI_SUCCESS) { const char *err_msg = opal_strerror(ret); /* If ORTE was not setup yet, don't use orte_show_help */ if (orte_setup) { orte_show_help("help-mpi-runtime", "mpi_init:startup:internal-failure", true, "MPI_INIT", "MPI_INIT", error, err_msg, ret); } else { opal_show_help("help-mpi-runtime", "mpi_init:startup:internal-failure", true, "MPI_INIT", "MPI_INIT", error, err_msg, ret); } return ret; } /* Initialize the registered datarep list to be empty */ OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t); /* Initialize the arrays used to store the F90 types returned by the * MPI_Type_create_f90_XXX functions. */ OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */); OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP); OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t); opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP); /* All done. Wasn't that simple? */ ompi_mpi_initialized = true; /* check for timing request - get stop time and report elapsed time if so */ if (timing && 0 == ORTE_PROC_MY_NAME->vpid) { gettimeofday(&ompistop, NULL); opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec", (long)ORTE_PROC_MY_NAME->vpid, (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 + (ompistop.tv_usec - ompistart.tv_usec))); } return MPI_SUCCESS; }
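/* Sketch of the thread-level negotiation at the top of the older
 * ompi_mpi_init above, isolated as a pure function. The function name and
 * parameters are ours for illustration; have_threads/enable_mpi_threads stand
 * in for the OMPI_HAVE_THREAD_SUPPORT and OMPI_ENABLE_MPI_THREADS build flags,
 * and the MPI_THREAD_* ordering is from the MPI standard. */
#include <mpi.h>

static int negotiate_thread_level(int requested, int have_threads,
                                  int enable_mpi_threads)
{
    if (!have_threads) {
        /* no thread support compiled in: everyone gets SINGLE */
        return MPI_THREAD_SINGLE;
    }
    if (enable_mpi_threads) {
        /* full MPI thread support: grant whatever was requested */
        return requested;
    }
    /* threads available internally but MPI_THREAD_MULTIPLE disabled:
     * cap MULTIPLE at SERIALIZED, pass lower levels through unchanged */
    return (MPI_THREAD_MULTIPLE == requested) ? MPI_THREAD_SERIALIZED
                                              : requested;
}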
static int init(void) { OBJ_CONSTRUCT(&hash_data, opal_hash_table_t); opal_hash_table_init(&hash_data, 256); return OPAL_SUCCESS; }
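/* The opal_hash_table_t idiom used throughout this file is: OBJ_CONSTRUCT the
 * table, opal_hash_table_init it with an initial size, then store and look up
 * values by key. A minimal usage sketch, assuming the uint64-keyed accessors
 * (opal_hash_table_set_value_uint64 / opal_hash_table_get_value_uint64) from
 * opal/class/opal_hash_table.h: */
static int hash_table_demo(void)
{
    opal_hash_table_t table;
    void *value = NULL;

    OBJ_CONSTRUCT(&table, opal_hash_table_t);
    opal_hash_table_init(&table, 256);   /* initial size; grows as needed */

    if (OPAL_SUCCESS !=
        opal_hash_table_set_value_uint64(&table, 42, (void *)"payload")) {
        OBJ_DESTRUCT(&table);
        return OPAL_ERROR;
    }
    if (OPAL_SUCCESS ==
        opal_hash_table_get_value_uint64(&table, 42, &value)) {
        /* value now points at "payload" */
    }

    OBJ_DESTRUCT(&table);                /* frees the table's internal storage */
    return OPAL_SUCCESS;
}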
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, int *model) { ompi_osc_ucx_module_t *module = NULL; char *name = NULL; long values[2]; int ret = OMPI_SUCCESS; ucs_status_t status; int i, comm_size = ompi_comm_size(comm); int is_eps_ready; bool eps_created = false, worker_created = false; ucp_address_t *my_addr = NULL; size_t my_addr_len; char *recv_buf = NULL; void *rkey_buffer = NULL, *state_rkey_buffer = NULL; size_t rkey_buffer_size, state_rkey_buffer_size; void *state_base = NULL; void * my_info = NULL; size_t my_info_len; int disps[comm_size]; int rkey_sizes[comm_size]; /* the osc/sm component is the exclusive provider for support for * shared memory windows */ if (flavor == MPI_WIN_FLAVOR_SHARED) { return OMPI_ERR_NOT_SUPPORTED; } /* if UCP worker has never been initialized before, init it first */ if (mca_osc_ucx_component.ucp_worker == NULL) { ucp_worker_params_t worker_params; ucp_worker_attr_t worker_attr; memset(&worker_params, 0, sizeof(ucp_worker_params_t)); worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; worker_params.thread_mode = (mca_osc_ucx_component.enable_mpi_threads == true) ? UCS_THREAD_MODE_MULTI : UCS_THREAD_MODE_SINGLE; status = ucp_worker_create(mca_osc_ucx_component.ucp_context, &worker_params, &(mca_osc_ucx_component.ucp_worker)); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_create failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } /* query UCP worker attributes */ worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; status = ucp_worker_query(mca_osc_ucx_component.ucp_worker, &worker_attr); if (UCS_OK != status) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_query failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } if (mca_osc_ucx_component.enable_mpi_threads == true && worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucx does not support multithreading\n", __FILE__, __LINE__); ret = OMPI_ERROR; goto error; } worker_created = true; } /* create module structure */ module = (ompi_osc_ucx_module_t *)calloc(1, sizeof(ompi_osc_ucx_module_t)); if (module == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } /* fill in the function pointer part */ memcpy(module, &ompi_osc_ucx_module_template, sizeof(ompi_osc_base_module_t)); ret = ompi_comm_dup(comm, &module->comm); if (ret != OMPI_SUCCESS) { goto error; } asprintf(&name, "ucx window %d", ompi_comm_get_cid(module->comm)); ompi_win_set_name(win, name); free(name); /* share everyone's displacement units. Only do an allgather if strictly necessary, since it requires O(p) state.
*/ values[0] = disp_unit; values[1] = -disp_unit; ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, values, 2, MPI_LONG, MPI_MIN, module->comm, module->comm->c_coll->coll_allreduce_module); if (OMPI_SUCCESS != ret) { goto error; } if (values[0] == -values[1]) { /* everyone has the same disp_unit, we do not need O(p) space */ module->disp_unit = disp_unit; } else { /* different disp_unit sizes, allocate O(p) space to store them */ module->disp_unit = -1; module->disp_units = calloc(comm_size, sizeof(int)); if (module->disp_units == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } ret = module->comm->c_coll->coll_allgather(&disp_unit, 1, MPI_INT, module->disp_units, 1, MPI_INT, module->comm, module->comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { goto error; } } /* exchange endpoints if necessary */ is_eps_ready = 1; for (i = 0; i < comm_size; i++) { if (OSC_UCX_GET_EP(module->comm, i) == NULL) { is_eps_ready = 0; break; } } ret = module->comm->c_coll->coll_allreduce(MPI_IN_PLACE, &is_eps_ready, 1, MPI_INT, MPI_LAND, module->comm, module->comm->c_coll->coll_allreduce_module); if (OMPI_SUCCESS != ret) { goto error; } if (!is_eps_ready) { status = ucp_worker_get_address(mca_osc_ucx_component.ucp_worker, &my_addr, &my_addr_len); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_worker_get_address failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } ret = allgather_len_and_info(my_addr, (int)my_addr_len, &recv_buf, disps, module->comm); if (ret != OMPI_SUCCESS) { goto error; } for (i = 0; i < comm_size; i++) { if (OSC_UCX_GET_EP(module->comm, i) == NULL) { ucp_ep_params_t ep_params; ucp_ep_h ep; memset(&ep_params, 0, sizeof(ucp_ep_params_t)); ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = (ucp_address_t *)&(recv_buf[disps[i]]); status = ucp_ep_create(mca_osc_ucx_component.ucp_worker, &ep_params, &ep); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_create failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } ompi_comm_peer_lookup(module->comm, i)->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_UCX] = ep; } } ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); my_addr = NULL; free(recv_buf); recv_buf = NULL; eps_created = true; } ret = mem_map(base, size, &(module->memh), module, flavor); if (ret != OMPI_SUCCESS) { goto error; } state_base = (void *)&(module->state); ret = mem_map(&state_base, sizeof(ompi_osc_ucx_state_t), &(module->state_memh), module, MPI_WIN_FLAVOR_CREATE); if (ret != OMPI_SUCCESS) { goto error; } module->win_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); if (module->win_info_array == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } module->state_info_array = calloc(comm_size, sizeof(ompi_osc_ucx_win_info_t)); if (module->state_info_array == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->memh, &rkey_buffer, &rkey_buffer_size); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_rkey_pack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } status = ucp_rkey_pack(mca_osc_ucx_component.ucp_context, module->state_memh, &state_rkey_buffer, &state_rkey_buffer_size); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_rkey_pack failed: 
%d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } my_info_len = 2 * sizeof(uint64_t) + rkey_buffer_size + state_rkey_buffer_size; my_info = malloc(my_info_len); if (my_info == NULL) { ret = OMPI_ERR_TEMP_OUT_OF_RESOURCE; goto error; } memcpy(my_info, base, sizeof(uint64_t)); memcpy((void *)((char *)my_info + sizeof(uint64_t)), &state_base, sizeof(uint64_t)); memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t)), rkey_buffer, rkey_buffer_size); memcpy((void *)((char *)my_info + 2 * sizeof(uint64_t) + rkey_buffer_size), state_rkey_buffer, state_rkey_buffer_size); ret = allgather_len_and_info(my_info, (int)my_info_len, &recv_buf, disps, module->comm); if (ret != OMPI_SUCCESS) { goto error; } ret = comm->c_coll->coll_allgather((void *)&rkey_buffer_size, 1, MPI_INT, rkey_sizes, 1, MPI_INT, comm, comm->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { goto error; } for (i = 0; i < comm_size; i++) { ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); assert(ep != NULL); memcpy(&(module->win_info_array[i]).addr, &recv_buf[disps[i]], sizeof(uint64_t)); memcpy(&(module->state_info_array[i]).addr, &recv_buf[disps[i] + sizeof(uint64_t)], sizeof(uint64_t)); status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t)]), &((module->win_info_array[i]).rkey)); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_rkey_unpack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } status = ucp_ep_rkey_unpack(ep, &(recv_buf[disps[i] + 2 * sizeof(uint64_t) + rkey_sizes[i]]), &((module->state_info_array[i]).rkey)); if (status != UCS_OK) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: ucp_ep_rkey_unpack failed: %d\n", __FILE__, __LINE__, status); ret = OMPI_ERROR; goto error; } } free(my_info); free(recv_buf); ucp_rkey_buffer_release(rkey_buffer); ucp_rkey_buffer_release(state_rkey_buffer); module->state.lock = TARGET_LOCK_UNLOCKED; module->state.post_index = 0; memset((void *)module->state.post_state, 0, sizeof(uint64_t) * OMPI_OSC_UCX_POST_PEER_MAX); module->state.complete_count = 0; module->state.req_flag = 0; module->state.acc_lock = TARGET_LOCK_UNLOCKED; module->epoch_type.access = NONE_EPOCH; module->epoch_type.exposure = NONE_EPOCH; module->lock_count = 0; module->post_count = 0; module->start_group = NULL; module->post_group = NULL; OBJ_CONSTRUCT(&module->outstanding_locks, opal_hash_table_t); OBJ_CONSTRUCT(&module->pending_posts, opal_list_t); module->global_ops_num = 0; module->per_target_ops_nums = calloc(comm_size, sizeof(int)); module->start_grp_ranks = NULL; module->lock_all_is_nocheck = false; ret = opal_hash_table_init(&module->outstanding_locks, comm_size); if (ret != OPAL_SUCCESS) { goto error; } win->w_osc_module = &module->super; /* sync with everyone */ ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module); if (ret != OMPI_SUCCESS) { goto error; } return ret; error: if (my_addr) ucp_worker_release_address(mca_osc_ucx_component.ucp_worker, my_addr); if (recv_buf) free(recv_buf); if (my_info) free(my_info); for (i = 0; i < comm_size; i++) { if ((module->win_info_array[i]).rkey != NULL) { ucp_rkey_destroy((module->win_info_array[i]).rkey); } if ((module->state_info_array[i]).rkey != NULL) { ucp_rkey_destroy((module->state_info_array[i]).rkey); } } if (rkey_buffer) ucp_rkey_buffer_release(rkey_buffer); if (state_rkey_buffer) ucp_rkey_buffer_release(state_rkey_buffer); if (module->win_info_array) 
free(module->win_info_array); if (module->state_info_array) free(module->state_info_array); if (module->disp_units) free(module->disp_units); if (module->comm) ompi_comm_free(&module->comm); if (module->per_target_ops_nums) free(module->per_target_ops_nums); if (eps_created) { for (i = 0; i < comm_size; i++) { ucp_ep_h ep = OSC_UCX_GET_EP(module->comm, i); ucp_ep_destroy(ep); } } if (worker_created) ucp_worker_destroy(mca_osc_ucx_component.ucp_worker); if (module) free(module); return ret; }
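/* The disp_unit exchange in component_select above packs (disp, -disp) into a
 * single MPI_MIN allreduce: if every rank has the same unit d, the result is
 * (min d, -max d) = (d, -d), so values[0] == -values[1]; any mismatch breaks
 * that equality and signals that the O(p) allgather path is needed. A
 * standalone MPI sketch of just that test: */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    long values[2];
    long disp_unit;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    disp_unit = 4;   /* try (rank ? 8 : 4) to exercise the other branch */
    values[0] = disp_unit;
    values[1] = -disp_unit;
    MPI_Allreduce(MPI_IN_PLACE, values, 2, MPI_LONG, MPI_MIN, MPI_COMM_WORLD);

    if (0 == rank) {
        if (values[0] == -values[1]) {
            printf("uniform disp_unit %ld: no per-rank table needed\n", values[0]);
        } else {
            printf("mixed disp_units: fall back to an allgather\n");
        }
    }

    MPI_Finalize();
    return 0;
}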