OSHMEM_DECLSPEC int oshmem_shmem_exchange_allgather(void *buf, int buf_size) { int rc = OSHMEM_SUCCESS; int i = 0; int *ranks_in_comm = NULL; ranks_in_comm = (int *) malloc(orte_process_info.num_procs * sizeof(int)); if (NULL == ranks_in_comm) { return OSHMEM_ERR_OUT_OF_RESOURCE; } for (i = 0; i < (int) orte_process_info.num_procs; ++i) { ranks_in_comm[i] = i; } void* buf_temp = malloc(buf_size); memcpy(buf_temp, (char*)buf + buf_size * ORTE_PROC_MY_NAME->vpid, buf_size); rc = comm_allgather_pml( buf_temp, buf, buf_size, MPI_BYTE, ORTE_PROC_MY_NAME->vpid, orte_process_info.num_procs, ranks_in_comm, (ompi_communicator_t *) &ompi_mpi_comm_world); if (ranks_in_comm) free(ranks_in_comm); if (buf_temp) free(buf_temp); return rc; }
int bcol_basesmuma_smcm_allgather_connection( mca_bcol_basesmuma_module_t *sm_bcol_module, mca_sbgp_base_module_t *module, opal_list_t *peer_list, bcol_basesmuma_smcm_proc_item_t ***back_files, ompi_communicator_t *comm, bcol_basesmuma_smcm_file_t input, char *base_fname, bool map_all) { /* define local variables */ int rc, i, fd; ptrdiff_t mem_offset; ompi_proc_t *proc_temp, *my_id; bcol_basesmuma_smcm_proc_item_t *temp; bcol_basesmuma_smcm_proc_item_t *item_ptr; bcol_basesmuma_smcm_proc_item_t **backing_files; struct file_info_t local_file; struct file_info_t *all_files=NULL; /* sanity check */ if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) { opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long: %s len :: %d", input.file_name, (int) strlen(input.file_name)); return OMPI_ERR_BAD_PARAM; } backing_files = (bcol_basesmuma_smcm_proc_item_t **) calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *)); if (!backing_files) { return OMPI_ERR_OUT_OF_RESOURCE; } /* FIXME *back_files might have been already allocated * so free it in order to avoid a memory leak */ if (NULL != *back_files) { free (*back_files); } *back_files = backing_files; my_id = ompi_proc_local(); /* Phase One: gather a list of processes that will participate in the allgather - I'm preparing this list from the sbgp-ing module that was passed into the function */ /* fill in local file information */ local_file.vpid = ((orte_process_name_t*)&my_id->super.proc_name)->vpid; local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid; local_file.file_size=input.size; local_file.size_ctl_structure=input.size_ctl_structure; local_file.data_seg_alignment=input.data_seg_alignment; strcpy (local_file.file_name, input.file_name); /* will exchange this data type as a string of characters - * this routine is first called before MPI_init() completes * and before error handling is setup, so can't use the * MPI data types to send this data */ 
all_files = (struct file_info_t *) calloc(module->group_size, sizeof (struct file_info_t)); if (!all_files) { return OMPI_ERR_OUT_OF_RESOURCE; } /* exchange data */ rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR, sm_bcol_module->super.sbgp_partner_module->my_index, sm_bcol_module->super.sbgp_partner_module->group_size, sm_bcol_module->super.sbgp_partner_module->group_list, sm_bcol_module->super.sbgp_partner_module->group_comm); if( OMPI_SUCCESS != rc ) { opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml. Error code: %d", rc); goto Error; } /* Phase four: loop through the receive buffer, unpack the data recieved from remote peers */ for (i = 0; i < module->group_size; i++) { struct file_info_t *rem_file = all_files + i; /* check if this is my index or if the file is already mapped (set above). ther * is no reason to look through the peer list again because no two members of * the group will have the same vpid/jobid pair. 
ignore this previously found * mapping if map_all was requested (NTH: not sure why exactly since we re-map * and already mapped file) */ if (sm_bcol_module->super.sbgp_partner_module->my_index == i) { continue; } proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]); OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) { /* if the vpid/jobid/filename combination already exists in the list, then do not map this peer's file --- because you already have */ if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name), &item_ptr->peer) && 0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) { ++item_ptr->refcnt; /* record file data */ backing_files[i] = item_ptr; break; } } if (!map_all && backing_files[i]) { continue; } temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t); if (!temp) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto Error; } temp->peer.vpid = rem_file->vpid; temp->peer.jobid = rem_file->jobid; temp->sm_file.file_name = strdup (rem_file->file_name); if (!temp->sm_file.file_name) { rc = OMPI_ERR_OUT_OF_RESOURCE; OBJ_RELEASE(temp); goto Error; } temp->sm_file.size = (size_t) rem_file->file_size; temp->sm_file.mpool_size = (size_t) rem_file->file_size; temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure; temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment; temp->refcnt = 1; /* Phase Five: If map_all == true, then we map every peer's file else we check to see if I have already mapped this vpid/jobid/filename combination and if I have, then I do not mmap this peer's file. * */ fd = open(temp->sm_file.file_name, O_RDWR, 0600); if (0 > fd) { opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. 
errno = %d", temp->sm_file.file_name, errno); rc = OMPI_ERROR; goto Error; } /* map the file */ temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size, temp->sm_file.size_ctl_structure, temp->sm_file.data_seg_alignment, temp->sm_file.file_name); close (fd); if (NULL == temp->sm_mmap) { opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file"); OBJ_RELEASE(temp); rc = OMPI_ERROR; goto Error; } /* compute memory offset */ mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr - (ptrdiff_t) temp->sm_mmap->map_seg; temp->sm_mmap->map_seg->seg_offset = mem_offset; temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset; /* more stuff to follow */ /* append this peer's info, including shared memory map addr, onto the peer_list */ /* record file data */ backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp; opal_list_append(peer_list, (opal_list_item_t*) temp); } rc = OMPI_SUCCESS; Error: /* error clean-up and return */ if (NULL != all_files) { free(all_files); } return rc; }
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data ) { /* local variables */ mca_sbgp_basesmsocket_module_t *module; /* opal_buffer_t* sbuffer = OBJ_NEW(opal_buffer_t); opal_buffer_t* rbuffer = OBJ_NEW(opal_buffer_t); */ opal_paffinity_base_cpu_set_t my_cpu_set; bool bound; int ret; int num_processors; int socket_tmp; int my_socket_index; int core_index=-1; int proc, cnt, local, n_local_peers, my_index, my_rank; ompi_proc_t* my_proc; int *local_ranks_in_comm=NULL; int *socket_info=NULL, my_socket_info; int i_cnt, lp_cnt, my_local_index, comm_size=ompi_comm_size(comm); /* initialize data */ output_data=NULL; my_rank=ompi_comm_rank(comm); my_proc=ompi_comm_peer_lookup(comm,my_rank); for( proc=0 ; proc < n_procs_in ; proc++) { if( procs[proc]==my_proc) { my_index=proc; } } /*create a new module*/ module=OBJ_NEW(mca_sbgp_basesmsocket_module_t); if (!module ) { return NULL; } module->super.group_size=0; module->super.group_comm = comm; module->super.group_list = NULL; module->super.group_net = OMPI_SBGP_SOCKET; /* ** get my process affinity information ** */ /* get the number of processors on this node */ ret=opal_paffinity_base_get_processor_info(&num_processors); /* get process affinity mask */ OPAL_PAFFINITY_CPU_ZERO(my_cpu_set); ret=opal_paffinity_base_get(&my_cpu_set); OPAL_PAFFINITY_PROCESS_IS_BOUND(my_cpu_set,&bound); /*debug process affinity*/ /* { ret=opal_paffinity_base_get_socket_info(&num_socket); fprintf(stderr,"Number of sockets %d\n",num_socket); fprintf(stderr,"Test if rank %d is bound %d\n", my_rank, bound); fprintf(stderr,"return from opal_paffinity_base_get: %d\n\n",ret); fprintf(stderr,"bitmask elements: "); unsigned int long jj; for(jj=0; jj < OPAL_PAFFINITY_BITMASK_NUM_ELEMENTS; jj++) fprintf(stderr," %d ",my_cpu_set.bitmask[jj]); fprintf(stderr,"\n"); fflush(stderr); } end debug process affinity*/ if( !bound ) { /* pa affinity 
not set, so socket index will be set to -1 */ my_socket_index=-1; /*debug print*/ /* */ fprintf(stderr,"[%d]FAILED to set basesmsocket group !!!\n",my_rank); fflush(stderr); /*end debug*/ goto NoLocalPeers; } else { my_socket_index=-1; /* loop over number of processors */ for ( proc=0 ; proc < num_processors ; proc++ ) { if (OPAL_PAFFINITY_CPU_ISSET(proc,my_cpu_set)) { ret=opal_paffinity_base_get_map_to_socket_core(proc,&socket_tmp,&core_index); if( my_socket_index != socket_tmp ) { my_socket_index=socket_tmp; break; } } } /* end of proc loop */ } /* Debug prints */ /* { fprintf(stderr,"Number of processors per node: %d\n",num_processors); fprintf(stderr,"I am rank %d and my socket index is %d\n and my core index is %d\n",my_rank,my_socket_index,core_index); fprintf(stderr,"n_proc_in = %d\n",n_procs_in); fprintf(stderr,"\n"); fflush(stderr); } end debug prints */ /*get my socket index*/ cnt=0; for( proc=0 ; proc < n_procs_in ; proc++) { local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if( local ) { cnt++; } } /*debug print */ /* fprintf(stderr,"Number of local processors %d\n",cnt); end debug print*/ /* if no other local procs found skip to end */ if( 1 >= cnt ) { goto NoLocalPeers; } #if 0 int *local_ranks_in_comm; int32_t *socket_info, *my_socket_info; int my_local_index; #endif /* allocate structure to hold the list of local ranks */ local_ranks_in_comm=(int *)malloc(sizeof(int)*cnt); if(NULL == local_ranks_in_comm ) { goto Error; } /* figure out which ranks from the input communicator - comm - will * particiapte in the local socket determination. */ n_local_peers=0; i_cnt=0; for( proc = 0; proc < n_procs_in; proc++) { local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if ( local ) { /* set the rank within the on-host ranks - this will be used for tha * allgather */ if( my_proc == procs[proc] ) { my_local_index=n_local_peers; } /* find the rank of the current proc in comm. 
We take advantage * of the fact that ranks in a group have the same relative * ordering as they do within the communicator. */ #if 1 /*for( lp_cnt=i_cnt; lp_cnt < comm_size ; lp_cnt++ ) {*/ for( lp_cnt=proc; lp_cnt < comm_size ; lp_cnt++ ) { if(procs[proc] == ompi_comm_peer_lookup(comm,lp_cnt) ) { local_ranks_in_comm[i_cnt]=lp_cnt; /* lp_cnt has alrady been checked */ i_cnt++; /* found the corresponding rank in comm, so don't need * to search any more */ break; } /*i_cnt++;*/ /*fprintf(stderr,"QQQ i_cnt %d \n",i_cnt);*/ } #endif n_local_peers++; } } /*fprintf(stderr,"YYY n_local_peers %d\n",n_local_peers);*/ socket_info=(int *)malloc(sizeof(int)*n_local_peers); /*fprintf(stderr,"XXX got socket info\n");*/ if(NULL == socket_info ) { goto Error; } my_socket_info=my_socket_index; /* Allgather data over the communicator */ ret=comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm); if (OMPI_SUCCESS != ret ) { fprintf(stderr," comm_allgather_pml returned error %d \n", ret); fflush(stderr); return NULL; } /*allocate memory to the group_list probably an overestimation of the necessary resources */ module->super.group_list=(int *)malloc(sizeof(int)*cnt); if(NULL == module->super.group_list) { goto Error; } /* figure out who is sharing the same socket */ cnt=0; for (proc = 0; proc < n_local_peers; proc++) { int rem_rank=local_ranks_in_comm[proc]; int rem_socket_index=socket_info[proc]; /*Populate the list*/ if (rem_socket_index == my_socket_index) { module->super.group_list[cnt]=rem_rank; cnt++; } } module->super.group_size=cnt; /*debug print*/ /* { int ii; fprintf(stderr,"Ranks per socket: %d\n",cnt); fprintf(stderr,"Socket %d owns ranks: ", my_socket_index); for (ii=0; ii < cnt; ii++) fprintf(stderr,"%d ",module->super.group_list[ii]); fprintf(stderr,"\n"); fflush(stderr); } { cpu_set_t set; unsigned int len = sizeof(set); int i; unsigned long mask = 0; CPU_ZERO(&set); if (sched_getaffinity(0, len, &set) < 0) 
{ perror("sched_getaffinity"); return -1; } for (i = 0; i < CPU_SETSIZE; i++) { int cpu = CPU_ISSET(i, &set); if (cpu) { mask |= 1<< i; } } opal_output(0,"%d: my affinity mask is: %08lx\n", my_local_index,mask); } end debug*/ /*Free resources*/ free(local_ranks_in_comm); free(socket_info); /*Return the module*/ return (mca_sbgp_base_module_t *) module; NoLocalPeers: /* nothing to store, so just free the module and return */ /*fprintf(stderr,"No local socket peers\n");*/ /*free(module);*/ if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; Error: /*clean up*/ if( NULL != module->super.group_list) { free(module->super.group_list); module->super.group_list=NULL; } if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; }
static mca_sbgp_base_module_t *mca_sbgp_basesmsocket_select_procs(struct ompi_proc_t ** procs, int n_procs_in, struct ompi_communicator_t *comm, char *key, void *output_data ) { /* local variables */ mca_sbgp_basesmsocket_module_t *module; int ret; int my_socket_index; int proc, cnt, local, n_local_peers, my_rank; ompi_proc_t* my_proc; int *local_ranks_in_comm=NULL; int *socket_info=NULL, my_socket_info; int i_cnt, lp_cnt, my_local_index = -1, comm_size=ompi_comm_size(comm); /* initialize data */ output_data=NULL; my_rank=ompi_comm_rank(comm); my_proc=ompi_comm_peer_lookup(comm,my_rank); /*create a new module*/ module=OBJ_NEW(mca_sbgp_basesmsocket_module_t); if (!module ) { return NULL; } module->super.group_size=0; module->super.group_comm = comm; module->super.group_list = NULL; module->super.group_net = OMPI_SBGP_SOCKET; /* test to see if process is bound */ if( OPAL_BIND_TO_NONE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy) ) { /* pa affinity not set, so socket index will be set to -1 */ my_socket_index=-1; /*debug print*/ /* */ BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group, processes are not bound!!!\n",my_rank)); /*end debug*/ goto NoLocalPeers; } else { my_socket_index=-1; /* this should find my logical socket id which is the socket id we want * physical socket ids are not necessarily unique, logical ones, as defined * by the hwloc API are unique. 
*/ if( OMPI_SUCCESS != mca_sbgp_map_to_logical_socket_id(&my_socket_index)) { BASESMSOCKET_VERBOSE(10, ("[%d] FAILED to set basesmsocket group !!!\n",my_rank)); goto NoLocalPeers; } } /* Debug prints */ /* { fprintf(stderr,"Number of processors per node: %d\n",num_processors); fprintf(stderr,"I am rank %d and my socket index is %d\n and my core index is %d\n",my_rank,my_socket_index,core_index); fprintf(stderr,"n_proc_in = %d\n",n_procs_in); fprintf(stderr,"\n"); fflush(stderr); } end debug prints */ /*get my socket index*/ cnt=0; for( proc=0 ; proc < n_procs_in ; proc++) { local=OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if( local ) { cnt++; } } /*debug print */ /* fprintf(stderr,"Number of local processors %d\n",cnt); end debug print*/ /* if no other local procs found skip to end */ if( 1 >= cnt ) { goto NoLocalPeers; } /* allocate structure to hold the list of local ranks */ local_ranks_in_comm=(int *)malloc(sizeof(int)*cnt); if(NULL == local_ranks_in_comm ) { goto Error; } /* figure out which ranks from the input communicator - comm - will * particiapte in the local socket determination. */ n_local_peers=0; i_cnt=0; for( proc = 0; proc < n_procs_in; proc++) { local = OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags); if ( local ) { /* set the rank within the on-host ranks - this will be used for tha * allgather */ if( my_proc == procs[proc] ) { my_local_index=n_local_peers; } /* find the rank of the current proc in comm. We take advantage * of the fact that ranks in a group have the same relative * ordering as they do within the communicator. 
*/ for( lp_cnt=proc; lp_cnt < comm_size ; lp_cnt++ ) { if(procs[proc] == ompi_comm_peer_lookup(comm,lp_cnt) ) { local_ranks_in_comm[i_cnt]=lp_cnt; /* lp_cnt has alrady been checked */ i_cnt++; /* found the corresponding rank in comm, so don't need * to search any more */ break; } /*i_cnt++;*/ /*fprintf(stderr,"QQQ i_cnt %d \n",i_cnt);*/ } n_local_peers++; } } /*fprintf(stderr,"YYY n_local_peers %d\n",n_local_peers);*/ socket_info=(int *)malloc(sizeof(int)*n_local_peers); /*fprintf(stderr,"XXX got socket info\n");*/ if(NULL == socket_info ) { goto Error; } my_socket_info=my_socket_index; /* Allgather data over the communicator */ ret=comm_allgather_pml(&my_socket_info, socket_info, 1, MPI_INT, my_local_index, n_local_peers, local_ranks_in_comm,comm); if (OMPI_SUCCESS != ret ) { BASESMSOCKET_VERBOSE(10, ("comm_allgather_pml returned error %d\n",ret)); return NULL; } /*allocate memory to the group_list probably an overestimation of the necessary resources */ module->super.group_list=(int *)malloc(sizeof(int)*cnt); if(NULL == module->super.group_list) { goto Error; } /* figure out who is sharing the same socket */ cnt=0; for (proc = 0; proc < n_local_peers; proc++) { int rem_rank=local_ranks_in_comm[proc]; int rem_socket_index=socket_info[proc]; /*Populate the list*/ if (rem_socket_index == my_socket_index) { module->super.group_list[cnt]=rem_rank; cnt++; } } module->super.group_size=cnt; #if 0 /*debug print*/ { int ii; fprintf(stderr,"Ranks per socket: %d\n",cnt); fprintf(stderr,"Socket %d owns ranks: ", my_socket_index); for (ii=0; ii < cnt; ii++) fprintf(stderr,"%d ",module->super.group_list[ii]); fprintf(stderr,"\n"); fflush(stderr); } #endif /* end debug*/ /*Free resources*/ free(local_ranks_in_comm); free(socket_info); /*Return the module*/ return (mca_sbgp_base_module_t *) module; NoLocalPeers: /* nothing to store, so just free the module and return */ /*fprintf(stderr,"No local socket peers\n");*/ /*free(module);*/ if(socket_info) { free(socket_info); 
socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; Error: /*clean up*/ if( NULL != module->super.group_list) { free(module->super.group_list); module->super.group_list=NULL; } if(socket_info) { free(socket_info); socket_info=NULL; } if(local_ranks_in_comm) { free(local_ranks_in_comm); } OBJ_RELEASE(module); return NULL; }
/**
 * New init function used for the control scheme where the control
 * struct is put at the top of each payload buffer.
 *
 * The function
 *   - allocates the table of per-peer, per-buffer control/payload
 *     pointers (pload_mgmt->data_buffs),
 *   - maps the peers' payload backing files through
 *     bcol_basesmuma_smcm_allgather_connection(),
 *   - allgathers every peer's offset of the block inside its mapping and
 *     converts the offsets into addresses valid in this process,
 *   - initializes the control headers of the caller's own buffers, and
 *   - fills in the ml-memory descriptor of the module.
 *
 * The assumption is that the block has been registered with the sm bcol
 * and hence has been mapped by each process.
 *
 * @param payload_block  ml-level memory block descriptor
 * @param data_offset    size of the header preceding the payload in each
 *                       buffer
 * @param bcol_module    module being initialized; used as a
 *                       mca_bcol_basesmuma_module_t
 * @param reg_data       registration data; used as a
 *                       bcol_basesmuma_registration_data_t
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_OUT_OF_RESOURCE if an
 *         allocation fails, OMPI_ERROR if init_nb_coll_buff_desc() fails,
 *         otherwise the error reported by the connection manager or the
 *         allgather.
 */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
                                  uint32_t data_offset,
                                  mca_bcol_base_module_t *bcol_module,
                                  void *reg_data)
{
    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    sm_buffer_mgmt *pload_mgmt;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;
    int leading_dim,loop_limit,buf_id;
    unsigned char *base_ptr;
    mca_bcol_basesmuma_module_t *sm_bcol_module=
        (mca_bcol_basesmuma_module_t *)bcol_module;
    int my_idx, array_id;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    /* NULL so that the error exit can free it unconditionally */
    void **results_array = NULL, *mem_offset;

    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block
     */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers
     * difference here is that now we use our new data struct
     */
    malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank*
        pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if( !pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
    if (NULL == results_array) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* setup the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' file
     */
    ret = bcol_basesmuma_smcm_allgather_connection(
                                                   sm_bcol,
                                                   sm_bcol->super.sbgp_partner_module,
                                                   &(cs->sm_connections_list),
                                                   &(sm_bcol->payload_backing_files_info),
                                                   sm_bcol->super.sbgp_partner_module->group_comm,
                                                   input_file,cs->payload_base_fname,
                                                   false);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* now we exchange offset info - don't assume symmetric virtual memory
     */
    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
                           sm_bcol_module->super.sbgp_partner_module->my_index,
                           sm_bcol_module->super.sbgp_partner_module->group_size,
                           sm_bcol_module->super.sbgp_partner_module->group_list,
                           sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* convert memory offset to virtual address in current rank */
    leading_dim = pload_mgmt->size_of_group;
    loop_limit =  ml_block->num_banks*ml_block->num_buffers_per_bank;
    for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {

        /* get the base pointer */
        array_id=SM_ARRAY_INDEX(leading_dim,0,i);
        if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
            /* me */
            base_ptr=cs->sm_payload_structs->map_addr;
        } else {
            base_ptr=sm_bcol_module->payload_backing_files_info[i]->
                sm_mmap->map_addr;
        }

        /* first, set the pointer to the control struct */
        pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
            (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
        /* second, calculate where to set the data pointer */
        pload_mgmt->data_buffs[array_id].payload=(void *)
            (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                        (uint64_t)(uintptr_t) data_offset);

        /* every following buffer of this peer sits size_buffer bytes
         * after the previous one */
        for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
            int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
            array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
            /* now, play the same game as above
             *
             * first, set the control struct's position */
            pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
                (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
                             (uint64_t)(uintptr_t)ml_block->size_buffer));

            /* second, set the payload pointer */
            pload_mgmt->data_buffs[array_id].payload =(void *)
                (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                            (uint64_t)(uintptr_t) data_offset);
        }

    }

    /* done with the index array */
    free (results_array);
    results_array = NULL;

    /* initialize my control structures!! */
    my_idx = sm_bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size;
    for( buf_id = 0; buf_id < loop_limit; buf_id++){
        array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for( j = 0; j < SM_BCOLS_MAX; j++){
            for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }

    /* setup the data structures needed for releasing the payload
     * buffers back to the ml level
     */
    for( i=0 ; i < (int) ml_block->num_banks ; i++ ) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor=
            ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    if (NULL == ml_mem->bank_release_counter) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol_module->super.sbgp_partner_module->group_size,
                                               sm_bcol_module->pow_k)) {

        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    /* the offsets array is purely local scratch space */
    free(results_array);
    return ret;
}
/**
 * Exchange memory offsets between the members of the sbgp partner group
 * using the pml based allgather.
 *
 * Every process contributes a packed (my_index, mem_offset) record.  For
 * each received record the offset is stored, converted to a pointer, in
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index)], where index is the
 * group index carried in the record.
 *
 * @param sm_bcol_module  module whose sbgp partner module defines the group
 * @param result_array    output array receiving one offset per peer
 * @param mem_offset      offset contributed by the calling process
 * @param loop_limit      not used by this function
 * @param leading_dim     leading dimension passed to SM_ARRAY_INDEX
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if a staging buffer
 *         cannot be allocated, or the error from comm_allgather_pml().
 */
int base_bcol_basesmuma_exchange_offsets(
                                         mca_bcol_basesmuma_module_t *sm_bcol_module,
                                         void **result_array, uint64_t mem_offset, int loop_limit,
                                         int leading_dim)
{
    int ret=OMPI_SUCCESS,i;
    int count;
    int index_in_group;
    char *send_buff = NULL;
    char *recv_buff = NULL;
    uint64_t rem_mem_offset;

    /* one record = group index followed by the 64-bit offset */
    count = sizeof(uint64_t) + sizeof(int);
    send_buff = (char *) malloc(count);
    recv_buff = (char *) malloc((size_t) count *
                                sm_bcol_module->super.sbgp_partner_module->group_size);
    if (NULL == send_buff || NULL == recv_buff) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* exchange the base pointer for the controls structures - gather
     *  every one else's information.
     */

    /* pack the offset of the allocated region; memcpy is used because
     * the fields are not necessarily aligned inside the byte buffer */
    memcpy((void *) send_buff, (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index), sizeof(int));
    memcpy((void *) (send_buff+ sizeof(int)), (void *) &(mem_offset), sizeof(uint64_t));

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count,
                           MPI_BYTE,
                           sm_bcol_module->super.sbgp_partner_module->my_index,
                           sm_bcol_module->super.sbgp_partner_module->group_size,
                           sm_bcol_module->super.sbgp_partner_module->group_list,
                           sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* get the control structure offsets within the shared memory
     *   region and populate the control structures - we do not assume
     *   any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
        int array_id;

        /* get this peer's index in the group */
        memcpy((void *) &index_in_group, (void *) (recv_buff + i*count) , sizeof(int));

        /* get the offset */
        memcpy((void *) &rem_mem_offset, (void *) (recv_buff + i*count + sizeof(int)), sizeof(uint64_t));

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;
    }

exit_ERROR:
    /* clean up */
    free(send_buff);
    free(recv_buff);

    return ret;
}
/**
 * Exchange the location of the control-structure block with the members
 * of the sbgp partner group and set up ctl_mgmt->ctl_buffs.
 *
 * Each process publishes the offset of its data block relative to the
 * data address of its control mapping.  The offsets are gathered straight
 * into ctl_mgmt->ctl_buffs and then turned into addresses valid in this
 * process by adding the base address of the matching mapping.  The
 * remaining buffers of each peer follow at a stride of
 * sizeof(mca_bcol_basesmuma_ctl_struct_t).  Finally the caller's own
 * control structures are reset.
 *
 * @param sm_bcol_module  module providing the partner group and the
 *                        peers' control backing files
 * @param cs              component holding the local control mapping
 * @param ctl_mgmt        buffer management struct whose ctl_buffs table
 *                        is filled in
 * @param data_blk        data block whose offset is published
 *
 * @return OMPI_SUCCESS, or the error returned by comm_allgather_pml().
 */
static int base_bcol_basesmuma_exchange_ctl_params(
                                                   mca_bcol_basesmuma_module_t *sm_bcol_module,
                                                   mca_bcol_basesmuma_component_t *cs,
                                                   sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
{
    int status;
    int peer, slot;
    int n_bufs, stride;
    void *local_offset;
    unsigned char *map_base;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl;

    /* offset of the data block inside the mapped file */
    local_offset = (void *)((uintptr_t)data_blk->data -
                            (uintptr_t)cs->sm_ctl_structs->data_addr);

    /* number of buffers in the data block, and the table's leading dimension */
    n_bufs = cs->basesmuma_num_mem_banks + ctl_mgmt->number_of_buffs;
    stride = ctl_mgmt->size_of_group;

    /* the offsets travel as pointer-sized byte blobs */
    status = comm_allgather_pml(&local_offset, ctl_mgmt->ctl_buffs, sizeof(void *),
                                MPI_BYTE,
                                sm_bcol_module->super.sbgp_partner_module->my_index,
                                sm_bcol_module->super.sbgp_partner_module->group_size,
                                sm_bcol_module->super.sbgp_partner_module->group_list,
                                sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != status) {
        return status;
    }

    /* turn every peer's offset into an address valid in this process */
    for (peer = 0;
         peer < sm_bcol_module->super.sbgp_partner_module->group_size;
         peer++) {
        int idx = SM_ARRAY_INDEX(stride, 0, peer);
        uint64_t addr;

        /* pick the base of the mapping that belongs to this peer */
        if (peer != sm_bcol_module->super.sbgp_partner_module->my_index) {
            map_base = sm_bcol_module->ctl_backing_files_info[peer]->sm_mmap->map_addr;
        } else {
            /* my own mapping */
            map_base = cs->sm_ctl_structs->map_addr;
        }

        addr = (uint64_t)(uintptr_t) ctl_mgmt->ctl_buffs[idx] +
               (uint64_t)(uintptr_t) map_base;
        ctl_mgmt->ctl_buffs[idx] = (void *)(uintptr_t) addr;

        /* each following buffer starts one control struct after the
         * previously stored (pointer-width) address */
        for (slot = 1; slot < n_bufs; slot++) {
            int prev = SM_ARRAY_INDEX(stride, (slot - 1), peer);

            idx = SM_ARRAY_INDEX(stride, slot, peer);
            addr = (uint64_t)(uintptr_t) ctl_mgmt->ctl_buffs[prev] +
                   (uint64_t)(uintptr_t) sizeof(mca_bcol_basesmuma_ctl_struct_t);
            ctl_mgmt->ctl_buffs[idx] = (void *)(uintptr_t) addr;
        }
    }

    /* reset my own control structures - RLG, this is only one data
     * structure that needs to be initialized, more are missing */
    for (slot = 0; slot < n_bufs; slot++) {
        int me = sm_bcol_module->super.sbgp_partner_module->my_index;

        my_ctl = (mca_bcol_basesmuma_ctl_struct_t *)
            ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(stride, slot, me)];

        my_ctl->sequence_number = -1;
        my_ctl->flag = -1;
        my_ctl->index = 0;
        my_ctl->src_ptr = NULL;
    }

    return status;
}