struct mca_sharedfp_base_module_1_0_0_t *
mca_sharedfp_sm_component_file_query(mca_io_ompio_file_t *fh, int *priority)
{
    int i;
    ompi_proc_t *proc;
    ompi_communicator_t *comm = fh->f_comm;
    int size = ompi_comm_size(comm);

    *priority = 0;

    /* test, and update priority. All processes have to be
    ** on a single node.
    ** original test copied from mca/coll/sm/coll_sm_module.c:
    */
    ompi_group_t *group = comm->c_local_group;

    for (i = 0; i < size; ++i) {
        proc = ompi_group_peer_lookup(group, i);
        if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
            opal_output(1, "mca_sharedfp_sm_component_file_query: Disqualifying myself: (%d/%s) "
                        "not all processes are on the same node.",
                        comm->c_contextid, comm->c_name);
            return NULL;
        }
    }

    /* This module can run */
    *priority = mca_sharedfp_sm_priority;
    return &sm;
}
static void mca_coll_hierarch_checkfor_sm(struct ompi_communicator_t *comm,
                                          int *color, int *ncount)
{
    int i, size;
    int lncount = 0;
    struct ompi_proc_t **procs = NULL;
    struct ompi_proc_t *my_proc = NULL;

    *color = -1;
    size = ompi_comm_size(comm);
    my_proc = ompi_proc_local();
    procs = comm->c_local_group->grp_proc_pointers;
    for (i = 0; i < size; i++) {
        if (OMPI_CAST_RTE_NAME(&procs[i]->super.proc_name)->jobid ==
            OMPI_CAST_RTE_NAME(&my_proc->super.proc_name)->jobid &&
            OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            lncount++;
            if (*color == -1) {
                *color = i;
            }
        }
    }

    /* we need to decrease ncount in order to make the other
       allreduce/allgather operations work */
    lncount--;
    *ncount = lncount;
    return;
}
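/*
 * A minimal usage sketch (hypothetical caller in the same translation unit,
 * not from the Open MPI tree): the (color, ncount) pair computed above can
 * seed an on-node subgrouping decision.
 */
static void example_checkfor_sm_usage(struct ompi_communicator_t *comm)
{
    int color, ncount;

    mca_coll_hierarch_checkfor_sm(comm, &color, &ncount);
    if (-1 == color) {
        return; /* no on-node peer of my job found in this communicator */
    }
    /* color is the lowest rank index that is on my node; ncount is the
     * number of on-node peers minus one (see the decrement above) */
}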
static int vader_add_procs (struct mca_btl_base_module_t *btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers,
                            opal_bitmap_t *reachability)
{
    mca_btl_vader_component_t *component = &mca_btl_vader_component;
    mca_btl_vader_t *vader_btl = (mca_btl_vader_t *) btl;
    int32_t proc, local_rank;
    ompi_proc_t *my_proc;
    int rc;

    /* initialization */

    /* get pointer to my proc structure */
    if (NULL == (my_proc = ompi_proc_local())) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* jump out if there's not someone we can talk to */
    if (1 > MCA_BTL_VADER_NUM_LOCAL_PEERS) {
        return OMPI_SUCCESS;
    }

    /* make sure that my local rank has been defined */
    if (ORTE_LOCAL_RANK_INVALID == MCA_BTL_VADER_LOCAL_RANK) {
        return OMPI_ERROR;
    }

    if (!vader_btl->btl_inited) {
        rc = vader_btl_first_time_init (vader_btl, 1 + MCA_BTL_VADER_NUM_LOCAL_PEERS);
        if (rc != OMPI_SUCCESS) {
            return rc;
        }
    }

    for (proc = 0, local_rank = 0 ; proc < (int32_t) nprocs ; ++proc) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            peers[proc] = NULL;
            continue;
        }

        if (my_proc != procs[proc]) {
            /* add this proc to shared memory accessibility list */
            rc = opal_bitmap_set_bit (reachability, proc);
            if (OMPI_SUCCESS != rc) {
                return rc;
            }
        }

        /* setup endpoint */
        peers[proc] = component->endpoints + local_rank;
        init_vader_endpoint (peers[proc], procs[proc], local_rank++);
    }

    return OMPI_SUCCESS;
}
static int _get_local_ranks(mca_scoll_fca_module_t *fca_module)
{
    struct oshmem_group_t *comm = fca_module->comm;
    oshmem_proc_t *proc;
    int i, rank;

    /* Count the local ranks */
    fca_module->num_local_procs = 0;
    for (rank = 0; rank < comm->proc_count; ++rank) {
        proc = comm->proc_array[rank];
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            if (proc->super.proc_name.vpid == (uint32_t) fca_module->rank) {
                fca_module->local_proc_idx = fca_module->num_local_procs;
            }
            ++fca_module->num_local_procs;
        }
    }

    /* Make a list of local ranks */
    fca_module->local_ranks = calloc(fca_module->num_local_procs,
                                     sizeof *fca_module->local_ranks);
    if (!fca_module->local_ranks) {
        FCA_ERROR("Failed to allocate memory for %d local ranks",
                  fca_module->num_local_procs);
        return OSHMEM_ERROR;
    }

    i = 0;
    for (rank = 0; rank < comm->proc_count; ++rank) {
        proc = comm->proc_array[rank];
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            fca_module->local_ranks[i++] = rank;
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 3, "i am %d/%d",
                       fca_module->local_proc_idx, fca_module->num_local_procs);
    return OSHMEM_SUCCESS;
}
/**
 * Fills local rank information in fca_module.
 */
static int __get_local_ranks(mca_coll_fca_module_t *fca_module)
{
    ompi_communicator_t *comm = fca_module->comm;
    ompi_proc_t *proc;
    int i, rank;

    /* Count the local ranks */
    fca_module->num_local_procs = 0;
    for (rank = 0; rank < ompi_comm_size(comm); ++rank) {
        proc = __local_rank_lookup(comm, rank);
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            if (rank == fca_module->rank) {
                fca_module->local_proc_idx = fca_module->num_local_procs;
            }
            ++fca_module->num_local_procs;
        }
    }

    /* Make a list of local ranks */
    fca_module->local_ranks = calloc(fca_module->num_local_procs,
                                     sizeof *fca_module->local_ranks);
    if (!fca_module->local_ranks) {
        FCA_ERROR("Failed to allocate memory for %d local ranks",
                  fca_module->num_local_procs);
        return OMPI_ERROR;
    }

    i = 0;
    for (rank = 0; rank < ompi_comm_size(comm); ++rank) {
        proc = __local_rank_lookup(comm, rank);
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            fca_module->local_ranks[i++] = rank;
        }
    }

    FCA_MODULE_VERBOSE(fca_module, 3, "i am %d/%d",
                       fca_module->local_proc_idx, fca_module->num_local_procs);
    return OMPI_SUCCESS;
}
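/*
 * The count-then-fill idiom used by the two local-rank helpers above, in
 * isolation (generic, hedged sketch; not a function from the tree): pass
 * one sizes the array exactly, pass two populates it, so no reallocation
 * is ever needed.  Assumes <stdlib.h> and <stdbool.h>.
 */
static int *collect_matching(int n, bool (*match)(int), int *out_count)
{
    int i, cnt = 0, *list;

    for (i = 0; i < n; ++i) {            /* pass 1: count the matches */
        if (match(i)) {
            ++cnt;
        }
    }
    list = (int *) calloc(cnt ? cnt : 1, sizeof(*list));
    if (NULL == list) {
        return NULL;
    }
    *out_count = 0;
    for (i = 0; i < n; ++i) {            /* pass 2: fill exactly cnt slots */
        if (match(i)) {
            list[(*out_count)++] = i;
        }
    }
    return list;
}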
static bool have_local_peers(ompi_group_t *group, size_t size)
{
    size_t i;
    ompi_proc_t *proc;

    for (i = 0; i < size; ++i) {
        proc = ompi_group_peer_lookup(group, i);
        if (!OPAL_PROC_ON_LOCAL_NODE(proc->proc_flags)) {
            return false;
        }
    }
    return true;
}
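/*
 * Hedged usage sketch (hypothetical query function, not from the tree):
 * a component that only works when the whole communicator fits on one
 * node can gate its priority on have_local_peers.  The priority value 50
 * is arbitrary.
 */
static int example_component_query(ompi_communicator_t *comm, int *priority)
{
    if (!have_local_peers(comm->c_local_group, ompi_comm_size(comm))) {
        *priority = 0;              /* some peer is off-node: disqualify */
        return OMPI_ERR_NOT_AVAILABLE;
    }
    *priority = 50;                 /* every peer shares this node */
    return OMPI_SUCCESS;
}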
static void unpack_remote_mkeys(opal_buffer_t *msg, int remote_pe)
{
    int32_t cnt;
    int32_t n;
    int32_t tr_id;
    int i;
    oshmem_proc_t *proc;

    proc = oshmem_proc_group_find(oshmem_group_all, remote_pe);
    cnt = 1;
    opal_dss.unpack(msg, &n, &cnt, OPAL_UINT32);
    for (i = 0; i < n; i++) {
        cnt = 1;
        opal_dss.unpack(msg, &tr_id, &cnt, OPAL_UINT32);
        cnt = 1;
        opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].va_base, &cnt, OPAL_UINT64);

        if (0 == memheap_oob.mkeys[tr_id].va_base) {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].u.key, &cnt, OPAL_UINT64);
            if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
                memheap_attach_segment(&memheap_oob.mkeys[tr_id], tr_id);
            }
        } else {
            cnt = 1;
            opal_dss.unpack(msg, &memheap_oob.mkeys[tr_id].len, &cnt, OPAL_UINT16);
            if (0 < memheap_oob.mkeys[tr_id].len) {
                memheap_oob.mkeys[tr_id].u.data = malloc(memheap_oob.mkeys[tr_id].len);
                if (NULL == memheap_oob.mkeys[tr_id].u.data) {
                    MEMHEAP_ERROR("Failed to allocate %d bytes",
                                  memheap_oob.mkeys[tr_id].len);
                    oshmem_shmem_abort(-1);
                }
                cnt = memheap_oob.mkeys[tr_id].len;
                opal_dss.unpack(msg, memheap_oob.mkeys[tr_id].u.data, &cnt, OPAL_BYTE);
                MCA_SPML_CALL(rmkey_unpack(&memheap_oob.mkeys[tr_id], remote_pe));
            } else {
                memheap_oob.mkeys[tr_id].u.key = MAP_SEGMENT_SHM_INVALID;
            }
        }

        MEMHEAP_VERBOSE(5, "tr_id: %d %s", tr_id,
                        mca_spml_base_mkey2str(&memheap_oob.mkeys[tr_id]));
    }
}
/*
 * Initial query function that is invoked during MPI_INIT, allowing
 * this component to disqualify itself if it doesn't support the
 * required level of thread support.  This function is invoked exactly
 * once.
 */
int mca_coll_sm_init_query(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    ompi_proc_t *my_proc, **procs;
    size_t i, size;

    /* See if there are other procs in my job on this node.  If not,
       then don't bother going any further. */
    if (NULL == (my_proc = ompi_proc_local()) ||
        NULL == (procs = ompi_proc_all(&size))) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: weirdness on procs; disqualifying myself");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* if no session directory was created, then we cannot be used */
    if (NULL == ompi_process_info.job_session_dir) {
        free(procs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (size <= 1) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: comm size too small; disqualifying myself");
        free(procs);
        return OMPI_ERR_NOT_AVAILABLE;
    }

    for (i = 0; i < size; ++i) {
        if (procs[i] != my_proc &&
            procs[i]->proc_name.jobid == my_proc->proc_name.jobid &&
            OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) {
            break;
        }
    }
    free(procs);
    if (i >= size) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: no other local procs; disqualifying myself");
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* Don't do much here because we don't really want to allocate any
       shared memory until this component is selected to be used. */
    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:sm:init_query: pick me! pick me!");
    return OMPI_SUCCESS;
}
static int have_remote_peers(struct oshmem_group_t *group, size_t size,
                             int *local_peers)
{
    struct oshmem_proc_t *proc;
    size_t i;
    int ret;

    *local_peers = 0;
    ret = 0;
    for (i = 0; i < size; ++i) {
        proc = group->proc_array[i];
        if (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            ++*local_peers;
        } else {
            ret = 1;
        }
    }
    return ret;
}
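/*
 * Hedged usage sketch (hypothetical caller): the return value answers
 * "is a network path needed at all?", while *local_peers sizes any
 * shared-memory bookkeeping.  'group' is any oshmem_group_t.
 */
int local_peers = 0;
if (have_remote_peers(group, group->proc_count, &local_peers)) {
    /* at least one PE is off-node: a network transport is required */
} else {
    /* all PEs are on this node: shared memory alone can reach them */
}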
bool ompi_group_have_remote_peers (ompi_group_t *group)
{
    for (int i = 0 ; i < group->grp_proc_count ; ++i) {
        ompi_proc_t *proc = NULL;
#if OMPI_GROUP_SPARSE
        proc = ompi_group_peer_lookup (group, i);
#else
        proc = ompi_group_get_proc_ptr_raw (group, i);
        if (ompi_proc_is_sentinel (proc)) {
            /* the proc must be stored in the group or cached in the proc
             * hash table if the process resides in the local node
             * (see ompi_proc_complete_init) */
            return true;
        }
#endif
        if (!OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)) {
            return true;
        }
    }

    return false;
}
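/*
 * Hedged usage sketch (hypothetical call site): single-node-only
 * components typically bail out early on this predicate.
 */
if (ompi_group_have_remote_peers(comm->c_local_group)) {
    return OMPI_ERR_NOT_AVAILABLE;  /* cannot reach every peer via shared memory */
}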
char *opal_hwloc_base_print_locality(opal_hwloc_locality_t locality)
{
    opal_hwloc_print_buffers_t *ptr;
    int idx;

    ptr = opal_hwloc_get_print_buffer();
    if (NULL == ptr) {
        return opal_hwloc_print_null;
    }
    /* cycle around the ring */
    if (OPAL_HWLOC_PRINT_NUM_BUFS == ptr->cntr) {
        ptr->cntr = 0;
    }

    idx = 0;

    if (OPAL_PROC_ON_LOCAL_CLUSTER(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'C';
        ptr->buffers[ptr->cntr][idx++] = 'L';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_CU(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'C';
        ptr->buffers[ptr->cntr][idx++] = 'U';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_NODE(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'N';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_BOARD(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'B';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_NUMA(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'N';
        ptr->buffers[ptr->cntr][idx++] = 'u';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_SOCKET(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'S';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_L3CACHE(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'L';
        ptr->buffers[ptr->cntr][idx++] = '3';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_L2CACHE(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'L';
        ptr->buffers[ptr->cntr][idx++] = '2';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_L1CACHE(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'L';
        ptr->buffers[ptr->cntr][idx++] = '1';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_CORE(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'C';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }
    if (OPAL_PROC_ON_LOCAL_HWTHREAD(locality)) {
        ptr->buffers[ptr->cntr][idx++] = 'H';
        ptr->buffers[ptr->cntr][idx++] = 'w';
        ptr->buffers[ptr->cntr][idx++] = 't';
        ptr->buffers[ptr->cntr][idx++] = ':';
    }

    if (0 < idx) {
        ptr->buffers[ptr->cntr][idx-1] = '\0';
    } else if (OPAL_PROC_NON_LOCAL & locality) {
        ptr->buffers[ptr->cntr][idx++] = 'N';
        ptr->buffers[ptr->cntr][idx++] = 'O';
        ptr->buffers[ptr->cntr][idx++] = 'N';
        ptr->buffers[ptr->cntr][idx++] = '\0';
    } else {
        /* must be an unknown locality */
        ptr->buffers[ptr->cntr][idx++] = 'U';
        ptr->buffers[ptr->cntr][idx++] = 'N';
        ptr->buffers[ptr->cntr][idx++] = 'K';
        ptr->buffers[ptr->cntr][idx++] = '\0';
    }

    return ptr->buffers[ptr->cntr];
}
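/*
 * Hedged example of the output format: each matching locality level
 * appends its token plus ':', and the trailing ':' is overwritten with
 * '\0'.  The OPAL_PROC_ON_NODE / OPAL_PROC_ON_SOCKET flag names are
 * assumed from the opal hwloc headers.
 */
opal_hwloc_locality_t loc = OPAL_PROC_ON_NODE | OPAL_PROC_ON_SOCKET;
opal_output(0, "locality: %s",
            opal_hwloc_base_print_locality(loc));  /* prints "locality: N:S" */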
/* This routine is used to find the list of procs that run on the
** same host as the calling process.
*/
static mca_sbgp_base_module_t *mca_sbgp_basesmuma_select_procs(struct ompi_proc_t **procs,
                                                               int n_procs_in,
                                                               struct ompi_communicator_t *comm,
                                                               char *key,
                                                               void *output_data)
{
    /* local variables */
    int cnt, proc, local, last_local_proc;
    mca_sbgp_basesmuma_module_t *module;

    module = OBJ_NEW(mca_sbgp_basesmuma_module_t);
    if (!module) {
        return NULL;
    }
    module->super.group_size = 0;
    module->super.group_comm = comm;
    module->super.group_list = NULL;
    module->super.group_net = OMPI_SBGP_MUMA;

    for (proc = 0, cnt = 0, last_local_proc = 0 ; proc < n_procs_in ; ++proc) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags);
        if (local) {
            last_local_proc = proc;
            cnt++;
        }
    }

    /* if no other local procs were found skip to the end */
    if (2 > cnt) {
        /* There's always at least one - namely myself */
        assert(1 == cnt);
        module->super.group_size = 1;
        module->super.group_list = (int *) malloc(sizeof(int));
        if (NULL == module->super.group_list) {
            goto Error;
        }
        module->super.group_list[0] = last_local_proc;
        /* let ml handle this case */
        goto OneLocalPeer;
    }

    /* generate list of local ranks */
    module->super.group_size = cnt;
    if (cnt > 0) {
        module->super.group_list = (int *) malloc(sizeof(int) * cnt);
        if (NULL == module->super.group_list) {
            goto Error;
        }
    }

    for (proc = 0, cnt = 0 ; proc < n_procs_in ; ++proc) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->super.proc_flags);
        if (local) {
            module->super.group_list[cnt++] = proc;
        }
    }

OneLocalPeer:
    /* successful completion */
    return (mca_sbgp_base_module_t *) module;

    /* return with error */
Error:
    /* clean up */
    if (NULL != module->super.group_list) {
        free(module->super.group_list);
        module->super.group_list = NULL;
    }
    OBJ_RELEASE(module);
    return NULL;
}
int ompi_mtl_mxm_module_init(void)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t ep_info;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    uint32_t jobid;
    uint64_t mxlr;
    ompi_proc_t **procs;
    unsigned ptl_bitmap;
    size_t totps, proc;
    int lr, nlps;
    int rc;

    mxlr = 0;
    lr = -1;

    jobid = ompi_mtl_mxm_get_job_id();
    if (0 == jobid) {
        MXM_ERROR("Failed to generate jobid");
        return OMPI_ERROR;
    }

    if (NULL == (procs = ompi_proc_world(&totps))) {
        MXM_ERROR("Unable to obtain process list");
        return OMPI_ERROR;
    }

    if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
        MXM_VERBOSE(1, "MXM support will be disabled because the total number "
                    "of processes (%lu) is less than the minimum set by the "
                    "mtl_mxm_np MCA parameter (%u)", totps, ompi_mtl_mxm.mxm_np);
        free(procs);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    MXM_VERBOSE(1, "MXM support enabled");

    if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
        MXM_ERROR("Unable to obtain local node rank");
        free(procs);
        return OMPI_ERROR;
    }
    nlps = ompi_process_info.num_local_peers + 1;

    for (proc = 0; proc < totps; proc++) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            mxlr = max(mxlr, procs[proc]->proc_name.vpid);
        }
    }
    free(procs);

    /* Setup the endpoint options and local addresses to bind to. */
#if MXM_API < MXM_VERSION(2,0)
    ptl_bitmap = ompi_mtl_mxm.mxm_ctx_opts->ptl_bitmap;
#else
    ptl_bitmap = 0;
#endif

    /* Open MXM endpoint */
    err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
                                 ptl_bitmap, lr, jobid, mxlr, nlps);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
                       mxm_error_string(err));
        return OMPI_ERROR;
    }

    /*
     * Get address for each PTL on this endpoint, and share it with other ranks.
     */
#if MXM_API < MXM_VERSION(2,0)
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SELF)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SELF)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_RDMA)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_RDMA)) {
        return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SHM)) &&
        OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
        return OMPI_ERROR;
    }

    ep_address = &ep_info;
    ep_address_len = sizeof(ep_info);
#else
    rc = ompi_mtl_mxm_get_ep_address(&ep_address, &ep_address_len);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
#endif

    rc = ompi_mtl_mxm_send_ep_address(ep_address, ep_address_len);
    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("Modex session failed.");
        return rc;
    }

#if MXM_API >= MXM_VERSION(2,0)
    free(ep_address);
#endif

    /* Register the MXM progress function */
    opal_progress_register(ompi_mtl_mxm_progress);

#if MXM_API >= MXM_VERSION(2,0)
    if (ompi_mtl_mxm.using_mem_hooks) {
        opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
    }
#endif
    return OMPI_SUCCESS;
}
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t **procs,
                                                                  int n_procs_in,
                                                                  struct ompi_communicator_t *comm,
                                                                  char *key,
                                                                  void *output_data)
{
    /* local variables */
    mca_sbgp_basesmsocket_module_t *module;
    int ret;
    int my_socket_index;
    int proc, cnt, local, n_local_peers, my_rank;
    ompi_proc_t *my_proc;
    int *local_ranks_in_comm = NULL;
    int *socket_info = NULL, my_socket_info;
    int i_cnt, lp_cnt, my_local_index = -1, comm_size = ompi_comm_size(comm);

    /* initialize data */
    output_data = NULL;
    my_rank = ompi_comm_rank(comm);
    my_proc = ompi_comm_peer_lookup(comm, my_rank);

    /* create a new module */
    module = OBJ_NEW(mca_sbgp_basesmsocket_module_t);
    if (!module) {
        return NULL;
    }
    module->super.group_size = 0;
    module->super.group_comm = comm;
    module->super.group_list = NULL;
    module->super.group_net = OMPI_SBGP_SOCKET;

    /* test to see if the process is bound */
    if (OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
        /* processor affinity is not set, so the socket index will be set to -1 */
        my_socket_index = -1;
        BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n", my_rank));
        goto NoLocalPeers;
    } else {
        my_socket_index = -1;
        /* this should find my logical socket id, which is the socket id we
         * want.  Physical socket ids are not necessarily unique; logical
         * ones, as defined by the hwloc API, are unique. */
        if (OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)) {
            BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n", my_rank));
            goto NoLocalPeers;
        }
    }

    /* count the local procs */
    cnt = 0;
    for (proc = 0 ; proc < n_procs_in ; proc++) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags);
        if (local) {
            cnt++;
        }
    }

    /* if no other local procs were found skip to the end */
    if (1 >= cnt) {
        goto NoLocalPeers;
    }

    /* allocate a structure to hold the list of local ranks */
    local_ranks_in_comm = (int *) malloc(sizeof(int) * cnt);
    if (NULL == local_ranks_in_comm) {
        goto Error;
    }

    /* figure out which ranks from the input communicator - comm - will
     * participate in the local socket determination. */
    n_local_peers = 0;
    i_cnt = 0;
    for (proc = 0; proc < n_procs_in; proc++) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags);
        if (local) {
            /* set the rank within the on-host ranks - this will be used for
             * the allgather */
            if (my_proc == procs[proc]) {
                my_local_index = n_local_peers;
            }
            /* find the rank of the current proc in comm.  We take advantage
             * of the fact that ranks in a group have the same relative
             * ordering as they do within the communicator. */
            for (lp_cnt = proc; lp_cnt < comm_size ; lp_cnt++) {
                if (procs[proc] == ompi_comm_peer_lookup(comm, lp_cnt)) {
                    local_ranks_in_comm[i_cnt] = lp_cnt;
                    /* lp_cnt has already been checked */
                    i_cnt++;
                    /* found the corresponding rank in comm, so don't need
                     * to search any more */
                    break;
                }
            }
            n_local_peers++;
        }
    }

    socket_info = (int *) malloc(sizeof(int) * n_local_peers);
    if (NULL == socket_info) {
        goto Error;
    }

    my_socket_info = my_socket_index;

    /* Allgather data over the communicator */
    ret = comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT,
                             my_local_index, n_local_peers,
                             local_ranks_in_comm, comm);
    if (OMPI_SUCCESS != ret) {
        BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n", ret));
        return NULL;
    }

    /* allocate memory for the group_list; this is probably an overestimate
       of the necessary resources */
    module->super.group_list = (int *) malloc(sizeof(int) * cnt);
    if (NULL == module->super.group_list) {
        goto Error;
    }

    /* figure out who is sharing the same socket */
    cnt = 0;
    for (proc = 0; proc < n_local_peers; proc++) {
        int rem_rank = local_ranks_in_comm[proc];
        int rem_socket_index = socket_info[proc];

        /* populate the list */
        if (rem_socket_index == my_socket_index) {
            module->super.group_list[cnt] = rem_rank;
            cnt++;
        }
    }
    module->super.group_size = cnt;

#if 0
    /* debug print */
    {
        int ii;
        fprintf(stderr, "Ranks per socket: %d\n", cnt);
        fprintf(stderr, "Socket %d owns ranks: ", my_socket_index);
        for (ii = 0; ii < cnt; ii++)
            fprintf(stderr, "%d ", module->super.group_list[ii]);
        fprintf(stderr, "\n");
        fflush(stderr);
    }
#endif

    /* free resources */
    free(local_ranks_in_comm);
    free(socket_info);

    /* return the module */
    return (mca_sbgp_base_module_t *) module;

NoLocalPeers:
    /* nothing to store, so just free the module and return */
    if (socket_info) {
        free(socket_info);
        socket_info = NULL;
    }
    if (local_ranks_in_comm) {
        free(local_ranks_in_comm);
    }
    OBJ_RELEASE(module);
    return NULL;

Error:
    /* clean up */
    if (NULL != module->super.group_list) {
        free(module->super.group_list);
        module->super.group_list = NULL;
    }
    if (socket_info) {
        free(socket_info);
        socket_info = NULL;
    }
    if (local_ranks_in_comm) {
        free(local_ranks_in_comm);
    }
    OBJ_RELEASE(module);
    return NULL;
}
static mca_mtl_base_module_t *
ompi_mtl_psm_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    psm_error_t err;
    int rc;
    int verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    ompi_proc_t *my_proc, **procs;
    size_t num_total_procs, proc;
    int local_rank = -1, num_local_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node.  We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes. */
    if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) {
        return NULL;
    }

    my_proc = ompi_proc_local();
    if (NULL == (procs = ompi_proc_world(&num_total_procs))) {
        return NULL;
    }

    for (proc = 0; proc < num_total_procs; proc++) {
        if (my_proc == procs[proc]) {
            local_rank = num_local_procs++;
            continue;
        }

        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            num_local_procs++;
        }
    }

    assert(local_rank >= 0 && num_local_procs > 0);
    free(procs);

    err = psm_error_register_handler(NULL /* no ep */,
                                     PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(err));
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                     (const void *) &ompi_mtl_psm.debug_level,
                     sizeof(unsigned));
    if (err) {
        /* Non-fatal error.  Can continue. */
        orte_show_help("help-mtl-psm.txt",
                       "psm init", false,
                       psm_error_get_string(err));
    }
#endif

    /* Only allow for shm and ipath devices in 2.0 and earlier releases
     * (unless the user overrides the setting). */
    if (PSM_VERNO >= 0x0104) {
        setenv("PSM_DEVICES", "self,shm,ipath", 0);
    } else {
        setenv("PSM_DEVICES", "shm,ipath", 0);
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
        orte_show_help("help-mtl-psm.txt",
                       "psm init", true,
                       psm_error_get_string(err));
        return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) -
        sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm.super;
}
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t **procs,
                                                                  int n_procs_in,
                                                                  struct ompi_communicator_t *comm,
                                                                  char *key,
                                                                  void *output_data)
{
    /* local variables */
    mca_sbgp_basesmsocket_module_t *module;
    opal_paffinity_base_cpu_set_t my_cpu_set;
    bool bound;
    int ret;
    int num_processors;
    int socket_tmp;
    int my_socket_index;
    int core_index = -1;
    int proc, cnt, local, n_local_peers, my_index, my_rank;
    ompi_proc_t *my_proc;
    int *local_ranks_in_comm = NULL;
    int *socket_info = NULL, my_socket_info;
    int i_cnt, lp_cnt, my_local_index, comm_size = ompi_comm_size(comm);

    /* initialize data */
    output_data = NULL;
    my_rank = ompi_comm_rank(comm);
    my_proc = ompi_comm_peer_lookup(comm, my_rank);
    for (proc = 0 ; proc < n_procs_in ; proc++) {
        if (procs[proc] == my_proc) {
            my_index = proc;
        }
    }

    /* create a new module */
    module = OBJ_NEW(mca_sbgp_basesmsocket_module_t);
    if (!module) {
        return NULL;
    }
    module->super.group_size = 0;
    module->super.group_comm = comm;
    module->super.group_list = NULL;
    module->super.group_net = OMPI_SBGP_SOCKET;

    /*
    ** get my process affinity information
    */

    /* get the number of processors on this node */
    ret = opal_paffinity_base_get_processor_info(&num_processors);

    /* get process affinity mask */
    OPAL_PAFFINITY_CPU_ZERO(my_cpu_set);
    ret = opal_paffinity_base_get(&my_cpu_set);
    OPAL_PAFFINITY_PROCESS_IS_BOUND(my_cpu_set, &bound);

    if (!bound) {
        /* processor affinity is not set, so the socket index will be set to -1 */
        my_socket_index = -1;
        fprintf(stderr, "[%d]FAILED to set basesmsocket group !!!\n", my_rank);
        fflush(stderr);
        goto NoLocalPeers;
    } else {
        my_socket_index = -1;
        /* loop over the number of processors */
        for (proc = 0 ; proc < num_processors ; proc++) {
            if (OPAL_PAFFINITY_CPU_ISSET(proc, my_cpu_set)) {
                ret = opal_paffinity_base_get_map_to_socket_core(proc, &socket_tmp, &core_index);
                if (my_socket_index != socket_tmp) {
                    my_socket_index = socket_tmp;
                    break;
                }
            }
        } /* end of proc loop */
    }

    /* count the local procs */
    cnt = 0;
    for (proc = 0 ; proc < n_procs_in ; proc++) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags);
        if (local) {
            cnt++;
        }
    }

    /* if no other local procs were found skip to the end */
    if (1 >= cnt) {
        goto NoLocalPeers;
    }

    /* allocate a structure to hold the list of local ranks */
    local_ranks_in_comm = (int *) malloc(sizeof(int) * cnt);
    if (NULL == local_ranks_in_comm) {
        goto Error;
    }

    /* figure out which ranks from the input communicator - comm - will
     * participate in the local socket determination. */
    n_local_peers = 0;
    i_cnt = 0;
    for (proc = 0; proc < n_procs_in; proc++) {
        local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags);
        if (local) {
            /* set the rank within the on-host ranks - this will be used for
             * the allgather */
            if (my_proc == procs[proc]) {
                my_local_index = n_local_peers;
            }
            /* find the rank of the current proc in comm.  We take advantage
             * of the fact that ranks in a group have the same relative
             * ordering as they do within the communicator. */
            for (lp_cnt = proc; lp_cnt < comm_size ; lp_cnt++) {
                if (procs[proc] == ompi_comm_peer_lookup(comm, lp_cnt)) {
                    local_ranks_in_comm[i_cnt] = lp_cnt;
                    /* lp_cnt has already been checked */
                    i_cnt++;
                    /* found the corresponding rank in comm, so don't need
                     * to search any more */
                    break;
                }
            }
            n_local_peers++;
        }
    }

    socket_info = (int *) malloc(sizeof(int) * n_local_peers);
    if (NULL == socket_info) {
        goto Error;
    }

    my_socket_info = my_socket_index;

    /* Allgather data over the communicator */
    ret = comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT,
                             my_local_index, n_local_peers,
                             local_ranks_in_comm, comm);
    if (OMPI_SUCCESS != ret) {
        fprintf(stderr, " comm_allgather_pml returned error %d \n", ret);
        fflush(stderr);
        return NULL;
    }

    /* allocate memory for the group_list; this is probably an overestimate
       of the necessary resources */
    module->super.group_list = (int *) malloc(sizeof(int) * cnt);
    if (NULL == module->super.group_list) {
        goto Error;
    }

    /* figure out who is sharing the same socket */
    cnt = 0;
    for (proc = 0; proc < n_local_peers; proc++) {
        int rem_rank = local_ranks_in_comm[proc];
        int rem_socket_index = socket_info[proc];

        /* populate the list */
        if (rem_socket_index == my_socket_index) {
            module->super.group_list[cnt] = rem_rank;
            cnt++;
        }
    }
    module->super.group_size = cnt;

    /* free resources */
    free(local_ranks_in_comm);
    free(socket_info);

    /* return the module */
    return (mca_sbgp_base_module_t *) module;

NoLocalPeers:
    /* nothing to store, so just free the module and return */
    if (socket_info) {
        free(socket_info);
        socket_info = NULL;
    }
    if (local_ranks_in_comm) {
        free(local_ranks_in_comm);
    }
    OBJ_RELEASE(module);
    return NULL;

Error:
    /* clean up */
    if (NULL != module->super.group_list) {
        free(module->super.group_list);
        module->super.group_list = NULL;
    }
    if (socket_info) {
        free(socket_info);
        socket_info = NULL;
    }
    if (local_ranks_in_comm) {
        free(local_ranks_in_comm);
    }
    OBJ_RELEASE(module);
    return NULL;
}
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf)
{
    int rc = ORTE_SUCCESS;
    opal_list_item_t *item;
    orte_namelist_t *nm;
    opal_buffer_t tmp_buf;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier entering allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* have I initialized my local info? */
    if (!coll_initialized) {
        orte_process_name_t proc;
        orte_vpid_t v;

        /* get my local rank so I can locally cache it */
        my_local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);

        /* if I am local_rank=0 for this node and job, then setup
         * my array of local_rank=0 peers */
        if (0 == my_local_rank) {
            /* we need one entry/node in this job */
            my_coll_peers = (orte_vpid_t *) malloc(orte_process_info.num_nodes * sizeof(orte_vpid_t));
            cpeers = 0;
        }

        /* cycle through the procs to create a list of those that are local to me */
        proc.jobid = ORTE_PROC_MY_NAME->jobid;
        for (v = 0; v < orte_process_info.num_procs; v++) {
            proc.vpid = v;
            ORTE_EPOCH_SET(proc.epoch, orte_util_lookup_epoch(&proc));

            /* is this proc local_rank=0 on its node? */
            if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
                my_coll_peers[cpeers++] = v;
            }

            /* if this is me, or this proc isn't on our node, ignore it */
            if (v == ORTE_PROC_MY_NAME->vpid ||
                !OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) {
                continue;
            }

            /* add this proc to our list of local peers */
            nm = OBJ_NEW(orte_namelist_t);
            nm->name.jobid = proc.jobid;
            nm->name.vpid = proc.vpid;
            ORTE_EPOCH_SET(nm->name.epoch, proc.epoch);
            opal_list_append(&my_local_peers, &nm->item);

            /* if I am not local_rank=0, is this one? */
            if (0 != my_local_rank && 0 == orte_ess.get_local_rank(&proc)) {
                my_local_rank_zero_proc.jobid = proc.jobid;
                my_local_rank_zero_proc.vpid = proc.vpid;
                ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch, proc.epoch);
            }
        }

        /* compute the number of local peers - note that this number
         * does not include me!! */
        num_local_peers = opal_list_get_size(&my_local_peers);

        /* flag that I have initialized things */
        coll_initialized = true;
    }

    /* if I am not local rank = 0 */
    if (0 != my_local_rank) {
        if (ORTE_VPID_INVALID == my_local_rank_zero_proc.vpid) {
            /* something is broken */
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            return ORTE_ERR_FATAL;
        }

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* send our data to the local_rank=0 proc on this node */
        if (0 > (rc = orte_rml.send_buffer(&my_local_rank_zero_proc, sbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* now receive the final result.  Be sure to do this in
         * a manner that allows us to return without being in a recv! */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* wait to complete - we will receive a single message
         * sent from our local_rank=0 peer */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < 1) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* copy payload to the caller's buffer */
        if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather.results))) {
            ORTE_ERROR_LOG(rc);
        }
        OPAL_THREAD_UNLOCK(&allgather.lock);

    } else {
        /* I am local_rank = 0 on this node! */

        /* setup the collective */
        OPAL_THREAD_LOCK(&allgather.lock);
        allgather.recvd = 0;
        /* reset the collector */
        OBJ_DESTRUCT(&allgather.results);
        OBJ_CONSTRUCT(&allgather.results, opal_buffer_t);
        /* seed with my data */
        opal_dss.copy_payload(&allgather.results, sbuf);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* wait to receive their data.  Be sure to do this in
         * a manner that allows us to return without being in a recv! */
        rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER,
                                     ORTE_RML_PERSISTENT, allgather_recv, &allgather);
        if (rc != ORTE_SUCCESS) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* wait to complete - we need to receive input from every
         * local peer (excluding myself) */
        OPAL_THREAD_LOCK(&allgather.lock);
        while (allgather.recvd < num_local_peers) {
            opal_condition_wait(&allgather.cond, &allgather.lock);
        }
        /* xfer to the tmp buf in case another allgather comes along */
        OBJ_CONSTRUCT(&tmp_buf, opal_buffer_t);
        opal_dss.copy_payload(&tmp_buf, &allgather.results);
        OPAL_THREAD_UNLOCK(&allgather.lock);

        /* cancel the lingering recv */
        orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER);

        /* take the recv'd data and use one of the base collectives
         * to exchange it with all other local_rank=0 procs in a scalable
         * manner - the exact collective will depend upon the number of
         * nodes in the job */
        if (ORTE_SUCCESS != (rc = orte_grpcomm_base_allgather(&tmp_buf, rbuf, num_local_peers + 1,
                                                              ORTE_PROC_MY_NAME->jobid,
                                                              cpeers, my_coll_peers))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&tmp_buf);
            return rc;
        }
        OBJ_DESTRUCT(&tmp_buf);  /* done with this */

        /* distribute the results to our local peers */
        for (item = opal_list_get_first(&my_local_peers);
             item != opal_list_get_end(&my_local_peers);
             item = opal_list_get_next(item)) {
            nm = (orte_namelist_t *) item;
            if (0 > (rc = orte_rml.send_buffer(&nm->name, rbuf, ORTE_RML_TAG_ALLGATHER, 0))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
    }

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output,
                         "%s grpcomm:hier allgather completed",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
int mca_spml_ikrit_add_procs(ompi_proc_t **procs, size_t nprocs)
{
    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t my_ep_info;
    size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
    mxm_error_t err;
    size_t i, n;
    int rc = OSHMEM_ERROR;
    ompi_proc_t *proc_self;
    int my_rank = oshmem_my_proc_id();

    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);

    /* Allocate connection requests */
    ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
    if (NULL == ep_info) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    if (mca_spml_ikrit.hw_rdma_channel) {
        ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
        if (NULL == ep_hw_rdma_info) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
    }

    mca_spml_ikrit.mxm_peers = (mxm_peer_t *) calloc(nprocs, sizeof(mxm_peer_t));
    if (NULL == mca_spml_ikrit.mxm_peers) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    memset(&my_ep_info, 0, sizeof(my_ep_info));

    if (mca_spml_ikrit.hw_rdma_channel) {
        err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep,
                                 &my_ep_info.addr.ep_addr, &mxm_addr_len);
        if (MXM_OK != err) {
            orte_show_help("help-oshmem-spml-ikrit.txt",
                           "unable to get endpoint address", true,
                           mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
                               sizeof(spml_ikrit_mxm_ep_conn_info_t));
    }
    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep,
                             &my_ep_info.addr.ep_addr, &mxm_addr_len);
    if (MXM_OK != err) {
        orte_show_help("help-oshmem-spml-ikrit.txt",
                       "unable to get endpoint address", true,
                       mxm_error_string(err));
        rc = OSHMEM_ERROR;
        goto bail;
    }

    oshmem_shmem_allgather(&my_ep_info, ep_info,
                           sizeof(spml_ikrit_mxm_ep_conn_info_t));

    opal_progress_register(spml_ikrit_progress);

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {
        /* mxm 2.0 keeps its connections on a list.  Make sure
         * that the list has a different order on every rank */
        i = (my_rank + n) % nprocs;
        mxm_peer_construct(&mca_spml_ikrit.mxm_peers[i]);

        err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr,
                             &mca_spml_ikrit.mxm_peers[i].mxm_conn);
        if (MXM_OK != err) {
            SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            goto bail;
        }
        mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i].mxm_conn,
                         &mca_spml_ikrit.mxm_peers[i]);
        if (mca_spml_ikrit.hw_rdma_channel) {
            err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep,
                                 ep_hw_rdma_info[i].addr.ep_addr,
                                 &mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn);
            if (MXM_OK != err) {
                SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
                goto bail;
            }
        } else {
            mca_spml_ikrit.mxm_peers[i].mxm_hw_rdma_conn =
                mca_spml_ikrit.mxm_peers[i].mxm_conn;
        }
    }

    if (ep_info)
        free(ep_info);
    if (ep_hw_rdma_info)
        free(ep_hw_rdma_info);

    if (mca_spml_ikrit.bulk_connect) {
        /* Need a barrier to ensure remote peers already created connection */
        oshmem_shmem_barrier();
        mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
    }

    proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
    /* identify local processes and change transport to SHM */
    for (i = 0; i < nprocs; i++) {
        if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            continue;
        }
        if (procs[i] == proc_self)
            continue;

        /* use zcopy for put/get via sysv shared memory with fallback to RDMA */
        mca_spml_ikrit.mxm_peers[i].ptl_id = MXM_PTL_SHM;
    }

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

bail:
    if (ep_info)
        free(ep_info);
    if (ep_hw_rdma_info)
        free(ep_hw_rdma_info);
    SPML_ERROR("add procs FAILED rc=%d", rc);
    return rc;
}
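/*
 * The staggered connect order used above, in isolation (hedged sketch;
 * nprocs and my_rank are assumed to be in scope): starting each rank's
 * loop at its own index spreads the connection load across peers instead
 * of having every rank contact peer 0 first.
 */
size_t n, i;
for (n = 0; n < nprocs; ++n) {
    i = (my_rank + n) % nprocs;   /* rank r visits r, r+1, ..., wrapping */
    /* connect to peer i here */
}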
static mca_mpool_base_module_t *
mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources)
{
    mca_mpool_sm_module_t *mpool_module;
    mca_allocator_base_component_t *allocator_component;
    ompi_proc_t **procs;
    size_t num_all_procs, i, num_local_procs = 0;

    /* README: this needs to change if procs in different jobs (even
     * spawned ones) are to talk using shared memory */
    if (NULL == (procs = ompi_proc_world(&num_all_procs))) {
        /* out of resources, so just bail */
        return NULL;
    }
    for (i = 0 ; i < num_all_procs ; ++i) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) {
            num_local_procs++;
        }
    }

    /* Make a new mpool module */
    mpool_module = (mca_mpool_sm_module_t *) malloc(sizeof(mca_mpool_sm_module_t));
    if (NULL == mpool_module) {
        free(procs);
        return NULL;
    }
    mca_mpool_sm_module_init(mpool_module);

    /* set sm_size */
    mpool_module->sm_size = resources->size;

    /* clip at the min size */
    if (mpool_module->sm_size < (long) ompi_mpool_sm_min_size) {
        mpool_module->sm_size = (long) ompi_mpool_sm_min_size;
    }

    allocator_component =
        mca_allocator_component_lookup(mca_mpool_sm_component.sm_allocator_name);

    /* if the specified allocator cannot be loaded - look for an alternative */
    if (NULL == allocator_component) {
        if (0 < opal_list_get_size(&ompi_allocator_base_framework.framework_components)) {
            mca_base_component_list_item_t *item =
                (mca_base_component_list_item_t *)
                opal_list_get_first(&ompi_allocator_base_framework.framework_components);
            allocator_component = (mca_allocator_base_component_t *) item->cli_component;
            opal_output(0, "mca_mpool_sm_init: "
                        "unable to locate allocator: %s - using %s\n",
                        mca_mpool_sm_component.sm_allocator_name,
                        allocator_component->allocator_version.mca_component_name);
        } else {
            opal_output(0, "mca_mpool_sm_init: "
                        "unable to locate allocator: %s\n",
                        mca_mpool_sm_component.sm_allocator_name);
            free(mpool_module);
            free(procs);
            return NULL;
        }
    }

    mpool_module->mem_node = resources->mem_node;

    opal_output(mca_mpool_sm_component.verbose,
                "mca_mpool_sm_init: shared memory size used: (%ld)",
                mpool_module->sm_size);

    if (NULL == (mpool_module->sm_common_module =
                 mca_common_sm_module_attach(&resources->bs_meta_buf,
                                             sizeof(mca_common_sm_module_t), 8))) {
        opal_output(mca_mpool_sm_component.verbose,
                    "mca_mpool_sm_init: "
                    "unable to create shared memory mapping (%s)",
                    resources->bs_meta_buf.seg_name);
        free(mpool_module);
        free(procs);
        return NULL;
    }
    free(procs);

    /* setup allocator */
    mpool_module->sm_allocator =
        allocator_component->allocator_init(true, mca_common_sm_seg_alloc,
                                            NULL, &(mpool_module->super));
    if (NULL == mpool_module->sm_allocator) {
        opal_output(0, "mca_mpool_sm_init: unable to initialize allocator");
        free(mpool_module);
        return NULL;
    }

    return &mpool_module->super;
}
static int ec_on_local_node (rte_ec_handle_t ec, rte_grp_handle_t group)
{
    ompi_proc_t *proc = (ompi_proc_t *) ec.handle;
    return OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags);
}
int mca_btl_sm_add_procs(
    struct mca_btl_base_module_t *btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t **peers,
    opal_bitmap_t *reachability)
{
    int return_code = OMPI_SUCCESS;
    int32_t n_local_procs = 0, proc, j, my_smp_rank = -1;
    ompi_proc_t *my_proc; /* pointer to caller's proc structure */
    mca_btl_sm_t *sm_btl;
    bool have_connected_peer = false;
    char **bases;
    /* for easy access to the mpool_sm_module */
    mca_mpool_sm_module_t *sm_mpool_modp = NULL;

    /* initialization */
    sm_btl = (mca_btl_sm_t *) btl;

    /* get pointer to my proc structure */
    if (NULL == (my_proc = ompi_proc_local()))
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Get unique host identifier for each process in the list,
     * and identify procs that are on this host.  Add procs on this
     * host to the shared memory reachability list.  Also, get the number
     * of local procs in the procs list. */
    for (proc = 0; proc < (int32_t) nprocs; proc++) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            peers[proc] = NULL;
            continue;
        }

        /* check to see if this is me */
        if (my_proc == procs[proc]) {
            my_smp_rank = mca_btl_sm_component.my_smp_rank = n_local_procs++;
            continue;
        }

        /* sm doesn't support heterogeneous yet... */
        if (procs[proc]->proc_arch != my_proc->proc_arch) {
            continue;
        }

        /* we have someone to talk to */
        have_connected_peer = true;

        if (!(peers[proc] = create_sm_endpoint(n_local_procs, procs[proc]))) {
            return_code = OMPI_ERROR;
            goto CLEANUP;
        }
        n_local_procs++;

        /* add this proc to shared memory accessibility list */
        return_code = opal_bitmap_set_bit(reachability, proc);
        if (OMPI_SUCCESS != return_code)
            goto CLEANUP;
    }

    /* jump out if there's not someone we can talk to */
    if (!have_connected_peer)
        goto CLEANUP;

    /* make sure that my_smp_rank has been defined */
    if (-1 == my_smp_rank) {
        return_code = OMPI_ERROR;
        goto CLEANUP;
    }

    if (!sm_btl->btl_inited) {
        return_code =
            sm_btl_first_time_init(sm_btl, my_smp_rank,
                                   mca_btl_sm_component.sm_max_procs);
        if (return_code != OMPI_SUCCESS) {
            goto CLEANUP;
        }
    }

    /* set local proc's smp rank in the peers structure for
     * rapid access and calculate reachability */
    for (proc = 0; proc < (int32_t) nprocs; proc++) {
        if (NULL == peers[proc])
            continue;
        mca_btl_sm_component.sm_peers[peers[proc]->peer_smp_rank] = peers[proc];
        peers[proc]->my_smp_rank = my_smp_rank;
    }

    bases = mca_btl_sm_component.shm_bases;
    sm_mpool_modp = (mca_mpool_sm_module_t *) mca_btl_sm_component.sm_mpool;

    /* initialize own FIFOs */
    /*
     * The receiver initializes all its FIFOs.  All components will
     * be allocated near the receiver.  Nothing will be local to
     * "the sender" since there will be many senders.
     */
    for (j = mca_btl_sm_component.num_smp_procs;
         j < mca_btl_sm_component.num_smp_procs + FIFO_MAP_NUM(n_local_procs); j++) {
        return_code = sm_fifo_init(mca_btl_sm_component.fifo_size,
                                   mca_btl_sm_component.sm_mpool,
                                   &mca_btl_sm_component.fifo[my_smp_rank][j],
                                   mca_btl_sm_component.fifo_lazy_free);
        if (return_code != OMPI_SUCCESS)
            goto CLEANUP;
    }

    opal_atomic_wmb();

    /* Sync with other local procs.  Force the FIFO initialization to always
     * happen before the readers access it. */
    opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
    while (n_local_procs > mca_btl_sm_component.sm_seg->module_seg->seg_inited) {
        opal_progress();
        opal_atomic_rmb();
    }

    /* it is now safe to unlink the shared memory segment.  only one process
     * needs to do this, so just let smp rank zero take care of it. */
    if (0 == my_smp_rank) {
        if (OMPI_SUCCESS != mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) {
            /* it is "okay" if this fails at this point.  we have gone this far,
             * so just warn about the failure and continue.  this is probably
             * only triggered by a programming error. */
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        /* SKG - another abstraction violation here, but I don't want to add
         * extra code in the sm mpool for further synchronization. */

        /* at this point, all processes have attached to the mpool segment.  so
         * it is safe to unlink it here. */
        if (OMPI_SUCCESS != mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) {
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        if (-1 == unlink(mca_btl_sm_component.sm_mpool_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_mpool_rndv_file_name);
        }
        if (-1 == unlink(mca_btl_sm_component.sm_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_rndv_file_name);
        }
    }

    /* free up some space used by the name buffers */
    free(mca_btl_sm_component.sm_mpool_ctl_file_name);
    free(mca_btl_sm_component.sm_mpool_rndv_file_name);
    free(mca_btl_sm_component.sm_ctl_file_name);
    free(mca_btl_sm_component.sm_rndv_file_name);

    /* coordinate with other processes */
    for (j = mca_btl_sm_component.num_smp_procs;
         j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {
        ptrdiff_t diff;

        /* spin until this element is allocated */
        /* doesn't really wait for that process... the FIFO might be
           allocated, but not initialized */
        opal_atomic_rmb();
        while (NULL == mca_btl_sm_component.shm_fifo[j]) {
            opal_progress();
            opal_atomic_rmb();
        }

        /* Calculate the difference as (my_base - their_base) */
        diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);

        /* store local address of remote fifos */
        mca_btl_sm_component.fifo[j] =
            (sm_fifo_t *) OFFSET2ADDR(diff, mca_btl_sm_component.shm_fifo[j]);

        /* cache local copy of peer memory node number */
        mca_btl_sm_component.mem_nodes[j] = mca_btl_sm_component.shm_mem_nodes[j];
    }

    /* update the local smp process count */
    mca_btl_sm_component.num_smp_procs += n_local_procs;

    /* make sure we have enough eager fragments for each process */
    return_code = ompi_free_list_resize_mt(&mca_btl_sm_component.sm_frags_eager,
                                           mca_btl_sm_component.num_smp_procs * 2);
    if (OMPI_SUCCESS != return_code)
        goto CLEANUP;

CLEANUP:
    return return_code;
}
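/*
 * Hedged sketch of the base/offset translation used in the loop above:
 * a FIFO pointer published by a peer is valid in the peer's mapping of
 * the shared segment; rebasing it into our mapping is plain pointer
 * arithmetic.  ADDR2OFFSET/OFFSET2ADDR are the btl's own macros; the
 * variable names here (my_base, their_base, their_fifo) are illustrative.
 */
ptrdiff_t diff = (char *) my_base - (char *) their_base;            /* ADDR2OFFSET */
sm_fifo_t *local_view = (sm_fifo_t *) ((char *) their_fifo + diff); /* OFFSET2ADDR */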
int mca_spml_ikrit_add_procs(ompi_proc_t **procs, size_t nprocs)
{
    spml_ikrit_mxm_ep_conn_info_t *ep_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t *ep_hw_rdma_info = NULL;
    spml_ikrit_mxm_ep_conn_info_t my_ep_info = {{0}};
#if MXM_API < MXM_VERSION(2,0)
    mxm_conn_req_t *conn_reqs;
    int timeout;
#else
    size_t mxm_addr_len = MXM_MAX_ADDR_LEN;
#endif
    mxm_error_t err;
    size_t i, n;
    int rc = OSHMEM_ERROR;
    ompi_proc_t *proc_self;
    int my_rank = oshmem_my_proc_id();

    OBJ_CONSTRUCT(&mca_spml_ikrit.active_peers, opal_list_t);

    /* Allocate connection requests */
#if MXM_API < MXM_VERSION(2,0)
    conn_reqs = malloc(nprocs * sizeof(mxm_conn_req_t));
    if (NULL == conn_reqs) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }
    memset(conn_reqs, 0x0, nprocs * sizeof(mxm_conn_req_t));
#endif
    ep_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
    if (NULL == ep_info) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

    if (mca_spml_ikrit.hw_rdma_channel) {
        ep_hw_rdma_info = calloc(sizeof(spml_ikrit_mxm_ep_conn_info_t), nprocs);
        if (NULL == ep_hw_rdma_info) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
    }

    mca_spml_ikrit.mxm_peers = (mxm_peer_t **)
        malloc(nprocs * sizeof(*(mca_spml_ikrit.mxm_peers)));
    if (NULL == mca_spml_ikrit.mxm_peers) {
        rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        goto bail;
    }

#if MXM_API < MXM_VERSION(2,0)
    if (OSHMEM_SUCCESS != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_SELF)) {
        rc = OSHMEM_ERROR;
        goto bail;
    }
    if (OSHMEM_SUCCESS != spml_ikrit_get_ep_address(&my_ep_info, MXM_PTL_RDMA)) {
        rc = OSHMEM_ERROR;
        goto bail;
    }
#else
    if (mca_spml_ikrit.hw_rdma_channel) {
        err = mxm_ep_get_address(mca_spml_ikrit.mxm_hw_rdma_ep,
                                 &my_ep_info.addr.ep_addr, &mxm_addr_len);
        if (MXM_OK != err) {
            orte_show_help("help-oshmem-spml-ikrit.txt",
                           "unable to get endpoint address", true,
                           mxm_error_string(err));
            rc = OSHMEM_ERROR;
            goto bail;
        }
        oshmem_shmem_allgather(&my_ep_info, ep_hw_rdma_info,
                               sizeof(spml_ikrit_mxm_ep_conn_info_t));
    }
    err = mxm_ep_get_address(mca_spml_ikrit.mxm_ep,
                             &my_ep_info.addr.ep_addr, &mxm_addr_len);
    if (MXM_OK != err) {
        orte_show_help("help-oshmem-spml-ikrit.txt",
                       "unable to get endpoint address", true,
                       mxm_error_string(err));
        rc = OSHMEM_ERROR;
        goto bail;
    }
#endif
    oshmem_shmem_allgather(&my_ep_info, ep_info,
                           sizeof(spml_ikrit_mxm_ep_conn_info_t));

    opal_progress_register(spml_ikrit_progress);

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {
        /* mxm 2.0 keeps its connections on a list.  Make sure
         * that the list has a different order on every rank */
        i = (my_rank + n) % nprocs;
        mca_spml_ikrit.mxm_peers[i] = OBJ_NEW(mxm_peer_t);
        if (NULL == mca_spml_ikrit.mxm_peers[i]) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            goto bail;
        }
        mca_spml_ikrit.mxm_peers[i]->pe = i;

#if MXM_API < MXM_VERSION(2,0)
        conn_reqs[i].ptl_addr[MXM_PTL_SELF] =
            (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_SELF];
        conn_reqs[i].ptl_addr[MXM_PTL_SHM] = NULL;
        conn_reqs[i].ptl_addr[MXM_PTL_RDMA] =
            (struct sockaddr *) &ep_info[i].addr.ptl_addr[MXM_PTL_RDMA];
#else
        err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, ep_info[i].addr.ep_addr,
                             &mca_spml_ikrit.mxm_peers[i]->mxm_conn);
        if (MXM_OK != err) {
            SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
            goto bail;
        }
        if (OSHMEM_SUCCESS != create_ptl_idx(i))
            goto bail;
        mxm_conn_ctx_set(mca_spml_ikrit.mxm_peers[i]->mxm_conn,
                         mca_spml_ikrit.mxm_peers[i]);
        if (mca_spml_ikrit.hw_rdma_channel) {
            err = mxm_ep_connect(mca_spml_ikrit.mxm_hw_rdma_ep,
                                 ep_hw_rdma_info[i].addr.ep_addr,
                                 &mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn);
            if (MXM_OK != err) {
                SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
                goto bail;
            }
        } else {
            mca_spml_ikrit.mxm_peers[i]->mxm_hw_rdma_conn =
                mca_spml_ikrit.mxm_peers[i]->mxm_conn;
        }
#endif
    }

#if MXM_API < MXM_VERSION(2,0)
    /* Connect to remote peers */
    if (mxm_get_version() < MXM_VERSION(1,5)) {
        timeout = 1000;
    } else {
        timeout = -1;
    }
    err = mxm_ep_connect(mca_spml_ikrit.mxm_ep, conn_reqs, nprocs, timeout);
    if (MXM_OK != err) {
        SPML_ERROR("MXM returned connect error: %s\n", mxm_error_string(err));
        for (i = 0; i < nprocs; ++i) {
            if (MXM_OK != conn_reqs[i].error) {
                SPML_ERROR("MXM EP connect to %s error: %s\n",
                           procs[i]->proc_hostname,
                           mxm_error_string(conn_reqs[i].error));
            }
        }
        rc = OSHMEM_ERR_CONNECTION_FAILED;
        goto bail;
    }

    /* Save returned connections */
    for (i = 0; i < nprocs; ++i) {
        mca_spml_ikrit.mxm_peers[i]->mxm_conn = conn_reqs[i].conn;
        if (OSHMEM_SUCCESS != create_ptl_idx(i)) {
            rc = OSHMEM_ERR_CONNECTION_FAILED;
            goto bail;
        }
        mxm_conn_ctx_set(conn_reqs[i].conn, mca_spml_ikrit.mxm_peers[i]);
    }

    if (conn_reqs)
        free(conn_reqs);
#endif
    if (ep_info)
        free(ep_info);
    if (ep_hw_rdma_info)
        free(ep_hw_rdma_info);

#if MXM_API >= MXM_VERSION(2,0)
    if (mca_spml_ikrit.bulk_connect) {
        /* Need a barrier to ensure remote peers already created connection */
        oshmem_shmem_barrier();
        mxm_ep_wireup(mca_spml_ikrit.mxm_ep);
    }
#endif

    proc_self = oshmem_proc_group_find(oshmem_group_all, my_rank);
    /* identify local processes and change transport to SHM */
    for (i = 0; i < nprocs; i++) {
        if (procs[i]->super.proc_name.jobid != proc_self->super.proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[i]->super.proc_flags)) {
            continue;
        }
        if (procs[i] == proc_self)
            continue;

        /* use zcopy for put/get via sysv shared memory */
        OSHMEM_PROC_DATA(procs[i])->transport_ids[0] = MXM_PTL_SHM;
        OSHMEM_PROC_DATA(procs[i])->transport_ids[1] = MXM_PTL_RDMA;
        OSHMEM_PROC_DATA(procs[i])->num_transports = 2;
    }

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

bail:
#if MXM_API < MXM_VERSION(2,0)
    if (conn_reqs)
        free(conn_reqs);
#endif
    if (ep_info)
        free(ep_info);
    if (ep_hw_rdma_info)
        free(ep_hw_rdma_info);
    SPML_ERROR("add procs FAILED rc=%d", rc);
    return rc;
}