/*
 * Start the admin nonblocking recursive-doubling barrier.
 *
 * This variant is specific to the recycling of data and works on a single
 * set of control buffers.  It assumes that a process has at most one
 * outstanding barrier per control structure; the bank generation counter
 * serves as the sequence number so that successive barrier calls on
 * different processes may overlap in time.
 *
 * Every wait below is a bounded poll (at most n_poll_loops reads).  If a
 * partner has not shown up in time, the phase to resume from is stored in
 * sm_desc and OMPI_SUCCESS is returned; once all phases pass, the phase is
 * set to NB_BARRIER_DONE.
 */
int bcol_basesmuma_rd_nb_barrier_init_admin(
    sm_nbbar_desc_t *sm_desc)
{
    mca_bcol_basesmuma_module_t *sm_module = sm_desc->sm_module;
    netpatterns_pair_exchange_node_t *xchg =
        &(sm_module->recursive_doubling_tree);
    const int pool = sm_desc->pool_index;
    const int me = sm_module->super.sbgp_partner_module->my_index;
    const int group_dim = sm_desc->coll_buff->size_of_group;
    const int64_t generation =
        sm_desc->coll_buff->ctl_buffs_mgmt[pool].bank_gen_counter;
    mca_bcol_basesmuma_ctl_struct_t **ctl_row;
    mca_bcol_basesmuma_ctl_struct_t volatile *self_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *peer_ctl;
    volatile int64_t *peer_seq;
    volatile int *peer_flag;
    int row_start, step, spin;
    bool arrived;

    /* locate the row of control-structure pointers that belongs to this
     * pool: it is stored after the number_of_buffs regular rows */
    row_start = sm_desc->coll_buff->number_of_buffs + pool;
    row_start = SM_ARRAY_INDEX(group_dim, row_start, 0);
    ctl_row = ((mca_bcol_basesmuma_ctl_struct_t **)
               sm_desc->coll_buff->ctl_buffs) + row_start;
    self_ctl = ctl_row[me];

    /* publish my arrival: reset the flag first, then (after a write
     * barrier) expose the new generation number */
    self_ctl->flag = -1;
    opal_atomic_wmb ();
    self_ctl->sequence_number = generation;

    /* pre-phase: an exchange node that has an extra source waits for that
     * extra rank to register; all other ranks have nothing to do here */
    if ((0 < xchg->n_extra_sources) &&
        (EXCHANGE_NODE == xchg->node_type)) {
        peer_ctl = ctl_row[xchg->rank_extra_source];
        peer_seq = (volatile int64_t *) &(peer_ctl->sequence_number);

        arrived = false;
        for (spin = 0; spin < sm_module->super.n_poll_loops; spin++) {
            if (*peer_seq >= generation) {
                arrived = true;
                break;
            }
        }
        if (!arrived) {
            /* resume from the pre-phase on the next progress call */
            sm_desc->collective_phase = NB_PRE_PHASE;
            return OMPI_SUCCESS;
        }
    }

    /* recursive doubling: one pairwise exchange per step */
    for (step = 0; step < xchg->n_exchanges; step++) {
        peer_ctl = ctl_row[me ^ (1 SHIFT_UP step)];
        peer_seq = (volatile int64_t *) &(peer_ctl->sequence_number);
        peer_flag = (volatile int *) &(peer_ctl->flag);

        /* tell my partner which step I have reached */
        self_ctl->flag = step;

        /* the partner counts as arrived once it is in a later generation,
         * or in this generation and at least at this step */
        arrived = false;
        for (spin = 0; spin < sm_module->super.n_poll_loops; spin++) {
            if ((*peer_seq > generation) ||
                (*peer_seq == generation && *peer_flag >= step)) {
                arrived = true;
                break;
            }
        }
        if (!arrived) {
            /* remember the step so progress can pick up from here */
            sm_desc->collective_phase = NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration = step;
            return OMPI_SUCCESS;
        }
    }

    /* post-phase: only relevant when extra sources exist */
    if (0 < xchg->n_extra_sources) {
        if (EXTRA_NODE == xchg->node_type) {
            /* I did not take part in the exchange: wait until my partner
             * reports that it is done with recursive doubling */
            peer_ctl = ctl_row[xchg->rank_extra_source];
            peer_seq = (volatile int64_t *) &(peer_ctl->sequence_number);
            peer_flag = (volatile int *) &(peer_ctl->flag);

            arrived = false;
            for (spin = 0; spin < sm_module->super.n_poll_loops; spin++) {
                if ((*peer_seq > generation) ||
                    ((*peer_seq == generation) &&
                     (*peer_flag == (xchg->log_2)))) {
                    arrived = true;
                    break;
                }
            }
            if (!arrived) {
                sm_desc->collective_phase = NB_POST_PHASE;
                return OMPI_SUCCESS;
            }
        } else {
            /* let the extra rank know the recursive doubling is over */
            self_ctl->flag = xchg->n_exchanges;
        }
    }

    /* every phase completed within this call */
    sm_desc->collective_phase = NB_BARRIER_DONE;

    return OMPI_SUCCESS;
}
/*
 * Fan-out step: wait (bounded poll) for my parent in the tree to raise its
 * BARRIER_FANOUT_FLAG, then raise my own so that my children can proceed.
 * A rank without a parent raises its flag immediately.
 *
 * Returns BCOL_FN_COMPLETE once my flag has been raised, BCOL_FN_STARTED if
 * the parent was not ready within cm->num_to_probe polls.
 */
static int bcol_basesmuma_fanout_new(
    bcol_function_args_t *input_args,
    mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *component = &mca_bcol_basesmuma_component;
    netpatterns_tree_node_t *tree_node = &(bcol_module->fanin_node);
    int group_dim = bcol_module->colls_no_user_data.size_of_group;
    int me = bcol_module->super.sbgp_partner_module->my_index;
    int buff_index = input_args->buffer_index;
    int first_slot, attempt;
    int released;
    int8_t ready_flag;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int64_t sequence_number = input_args->sequence_num;
    volatile mca_bcol_basesmuma_payload_t *ctl_structs;
    volatile mca_bcol_basesmuma_header_t *my_ctl;
    volatile mca_bcol_basesmuma_header_t *parent_ctl;

    /* find the row of per-rank entries that belongs to this buffer */
    first_slot = SM_ARRAY_INDEX(group_dim, buff_index, 0);
    ctl_structs = ((volatile mca_bcol_basesmuma_payload_t *)
                   bcol_module->colls_with_user_data.data_buffs) + first_slot;
    my_ctl = ctl_structs[me].ctl_struct;

    /* init the header; the macro also assigns ready_flag */
    BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);

    /* the root of the fan-out has nobody to wait for */
    released = !tree_node->n_parents;

    if (!released) {
        /* poll the parent a bounded number of times */
        parent_ctl = ctl_structs[tree_node->parent_rank].ctl_struct;
        for (attempt = 0; attempt < component->num_to_probe; attempt++) {
            if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number,
                              BARRIER_FANOUT_FLAG, bcol_id)) {
                released = 1;
                break;
            }
        }
    }

    if (!released) {
        /* parent has not arrived yet */
        return BCOL_FN_STARTED;
    }

    /* signal my children */
    my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
    /* bump the starting flag */
    my_ctl->starting_flag_value[bcol_id]++;

    return BCOL_FN_COMPLETE;
}
/*
 * Recursive k-ing (k-nomial) allgather over shared memory - init step.
 *
 * Example: k = 3, n = 9
 *   number of exchange steps           = log_k(n)
 *   number of steps per exchange step  = k (the radix)
 *
 * The function resets the per-buffer nonblocking state (iteration,
 * active_requests, status) and then runs as far as it can with bounded
 * polling (cm->num_to_probe probes per peer):
 *   - an EXTRA_NODE raises its ALLGATHER_FLAG and waits for its proxy
 *     (rank_extra_sources_array[0]) to publish the final flag value, then
 *     copies the gathered data from the proxy's payload;
 *   - a rank with extra sources first waits for its extra rank and copies
 *     that rank's contribution;
 *   - all non-extra ranks then run the pow_k exchange levels, copying
 *     r_len * pack_len bytes from each peer that is ready.
 *
 * Returns BCOL_FN_COMPLETE when everything finished in this call, or
 * BCOL_FN_STARTED after saving state (*iteration and/or *status) when some
 * peer was not ready within the probe budget.
 */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
                                           struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int8_t flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
    int bcol_id = (int) bcol_module->super.bcol_id;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    /* per-buffer state that survives between init and later progress calls */
    int *active_requests = &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;
    int i, j, probe;
    int knt;
    int src;
    int recv_offset, recv_len;
    int pow_k, tree_order;
    int max_requests = 0; /* important to initialize this: bits are XOR-ed in below */
    int matched = 0;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int buff_offset = bcol_module->super.hier_scather_offset;
    /* size in bytes of one contribution */
    int pack_len = input_args->count * input_args->dtype->super.size;
    void *data_addr = (void*)( (unsigned char *) input_args->sbuf + (size_t) input_args->sbuf_offset);
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *peer_data_pointer;
    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"entering p2p allgather pack_len %d\n",pack_len);
#endif

    /* locate the row of per-rank payload descriptors for this buffer;
     * NOTE(review): the row is selected with src_desc->buffer_index while
     * the saved state above is indexed with input_args->buffer_index -
     * presumably the two agree; confirm against the caller */
    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs+idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* NTH: copied from progress.  Read before the header init below. */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];

    /* initialize headers and ready flag (the macro is defined elsewhere and
     * receives ready_flag by name) */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* reset the saved nonblocking state */
    *iteration = 0;
    *active_requests = 0;
    *status = 0;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests
     * at each level each rank communicates with
     * at most (k - 1) peers
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_request == 2^(k - 1) -1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^= (1<<i);
    }

    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */
    if( EXTRA_NODE == exchange_node->node_type ) {
        /* then I will signal to my proxy rank*/
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        /* flag value the proxy publishes after the last exchange level
         * (same expression as used at the end of this function) */
        ready_flag = flag_offset + 1 + pow_k + 2;
        /* now, poll for completion */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* total number of contributions: sum of list_connected over the
         * whole group */
        knt = 0;
        for(i = 0; i < group_size; i++){
            knt += list_connected[i];
        }

        for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* we receive the entire message: knt * pack_len bytes,
                 * starting at buff_offset in both buffers */
                memcpy((void *)((unsigned char *) data_addr + buff_offset),
                       (void *) ((unsigned char *) peer_data_pointer + buff_offset),
                       knt * pack_len);
                goto FINISHED;
            }
        }

        /* proxy not ready yet: mark the pre-phase with iteration == -1 and bail */
        *iteration = -1;
        return BCOL_FN_STARTED;

    }else if ( 0 < exchange_node->n_extra_sources ) {
        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* number of contributions that precede the extra rank's data:
         * sum of list_connected[0 .. src-1] */
        knt = 0;
        for(i = 0; i < src; i++){
            knt += list_connected[i];
        }

        /* probe for extra rank's arrival */
        for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* copy in the extra rank's pack_len * list_connected[src]
                 * bytes at offset knt * pack_len */
                memcpy((void *)((unsigned char *) data_addr + knt*pack_len),
                       (void *) ((unsigned char *) peer_data_pointer + knt*pack_len),
                       pack_len * list_connected[src]);
                goto MAIN_PHASE;
            }
        }
        /* extra rank not here yet: save the flag value and mark the
         * pre-phase with iteration == -1 */
        *status = ready_flag;
        *iteration = -1;
        return BCOL_FN_STARTED;
    }

MAIN_PHASE:
    /* bump the ready flag */
    ready_flag++;

    /* we start the recursive k - ing phase */
    for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
        /* announce my arrival; the write barrier is issued before the flag
         * store so earlier stores are ordered ahead of it */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;

        /* calculate the number of active requests (macro defined elsewhere;
         * presumably pre-sets the bits of peers that do not exist at this
         * level - verify against its definition) */
        CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);

        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {
            /* recv phase */
            src = exchange_node->rank_exchanges[*iteration][j];
            if( src < 0 ) {
                /* then not a valid rank, continue */
                continue;
            }

            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;

            if( !(*active_requests&(1<<j))) {
                /* then the bit hasn't been set, thus this peer
                 * hasn't been processed at this level
                 */
                recv_offset = exchange_node->payload_info[*iteration][j].r_offset * pack_len;
                recv_len = exchange_node->payload_info[*iteration][j].r_len * pack_len;

                /* post the receive */
                /* I am putting the probe loop as the inner most loop to achieve
                 * better temporal locality
                 */
                matched = 0;
                for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                        matched = 1;
                        /* set this request's bit */
                        *active_requests ^= (1<<j);
                        /* get the data */
                        memcpy((void *)((unsigned char *) data_addr + recv_offset),
                               (void *)((unsigned char *) peer_data_pointer + recv_offset),
                               recv_len);
                    }
                }
            }
        }

        if( max_requests == *active_requests ){
            /* every peer of this level is accounted for:
             * bump the ready flag */
            ready_flag++;
            /*reset the active requests */
            *active_requests = 0;
        } else {
            /* save state and hop out
             * only the iteration needs to be tracked (it lives in
             * *iteration); the flag value just published is kept in *status
             */
            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
    }

FINISHED:
    /* bump this up */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
/*
 * Admin nonblocking barrier - progress function.
 *
 * Resumes the recursive-doubling barrier described by sm_desc from the
 * phase stored in sm_desc->collective_phase:
 *   NB_PRE_PHASE          - an exchange node waits for its extra rank;
 *   NB_RECURSIVE_DOUBLING - pairwise exchanges, restarted at
 *                           sm_desc->recursive_dbl_iteration;
 *   anything else         - treated as the post phase, where an extra node
 *                           waits for its partner to finish.
 * A descriptor that is NB_BARRIER_INACTIVE or NB_BARRIER_DONE is left
 * untouched.
 *
 * Each wait is a bounded poll of at most n_poll_loops reads.  When a
 * partner is not ready, the restart information is written back to sm_desc
 * and the function returns; when all phases pass, collective_phase becomes
 * NB_BARRIER_DONE.
 *
 * Returns OMPI_SUCCESS in every case.
 */
int bcol_basesmuma_rd_nb_barrier_progress_admin(
    sm_nbbar_desc_t *sm_desc)
{
    /* local variables */
    int ret = OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
    /* start_index gets a defined value even on paths that never use it */
    int pair_rank, start_index = 0, restart_phase;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank;
    mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    int64_t bank_genaration;
    int pool_index = sm_desc->pool_index;
    bool found;
    mca_bcol_basesmuma_module_t *bcol_module = sm_desc->sm_module;

    /* get the pointer to the segment of control structures */
    idx = sm_desc->coll_buff->number_of_buffs + pool_index;
    leading_dim = sm_desc->coll_buff->size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim, idx, 0);
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        sm_desc->coll_buff->ctl_buffs + idx;
    bank_genaration =
        sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node = &(bcol_module->recursive_doubling_tree);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    my_ctl = ctl_structs[my_rank];

    /* check to make sure that this should be progressed */
    if ((sm_desc->collective_phase == NB_BARRIER_INACTIVE) ||
        (sm_desc->collective_phase == NB_BARRIER_DONE)) {
        return OMPI_SUCCESS;
    }

    /* set the restart up - and jump to the correct place in the algorithm */
    restart_phase = sm_desc->collective_phase;
    if (NB_PRE_PHASE == restart_phase) {
        start_index = 0;
    } else if (NB_RECURSIVE_DOUBLING == restart_phase) {
        start_index = sm_desc->recursive_dbl_iteration;
        goto Exchange_phase;
    } else {
        goto Post_phase;
    }

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;

            /* I will participate in the exchange - wait for signal from
             * the extra process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl = ctl_structs[extra_rank];
            partner_sn = (volatile int64_t *) &(partner_ctl->sequence_number);

            /* spin n iterations until partner registers.  "found" must be
             * cleared before the loop: if n_poll_loops is not positive the
             * body never runs, and the check below would otherwise read an
             * uninitialized value */
            loop_cnt = 0;
            found = false;
            while (loop_cnt < bcol_module->super.n_poll_loops) {
                if (*partner_sn >= bank_genaration) {
                    found = true;
                    break;
                }
                loop_cnt++;
            }
            if (!found) {
                /* set restart parameters */
                sm_desc->collective_phase = NB_PRE_PHASE;
                return OMPI_SUCCESS;
            }
        } else {
            /* Nothing to do, already registered that I am here */
        }
    }

Exchange_phase:

    for (exchange = start_index;
         exchange < my_exchange_node->n_exchanges; exchange++) {
        volatile int64_t *partner_sn;
        volatile int *partner_flag;

        /* rank of exchange partner */
        pair_rank = my_rank ^ (1 SHIFT_UP exchange);
        partner_ctl = ctl_structs[pair_rank];
        partner_sn = (volatile int64_t *) &(partner_ctl->sequence_number);
        partner_flag = (volatile int *) &(partner_ctl->flag);

        /* signal that I am at iteration exchange of the algorithm */
        my_ctl->flag = exchange;

        /* check to see if the partner has arrived: it is either already in
         * a later generation, or in this one and at least at this step */
        loop_cnt = 0;
        found = false;
        while (loop_cnt < bcol_module->super.n_poll_loops) {
            if ((*partner_sn > bank_genaration) ||
                ((*partner_sn == bank_genaration) &&
                 (*partner_flag >= exchange))) {
                found = true;
                break;
            }
            loop_cnt++;
        }
        if (!found) {
            /* set restart parameters */
            sm_desc->collective_phase = NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration = exchange;
            return OMPI_SUCCESS;
        }
    }

Post_phase:

    if (0 < my_exchange_node->n_extra_sources) {
        if (EXTRA_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            volatile int *partner_flag;

            /* I will not participate in the exchange -
             * wait for signal from extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl = ctl_structs[extra_rank];
            partner_sn = (volatile int64_t *) &(partner_ctl->sequence_number);
            partner_flag = (volatile int *) &(partner_ctl->flag);

            /* spin n iterations until partner registers */
            loop_cnt = 0;
            found = false;
            while (loop_cnt < bcol_module->super.n_poll_loops) {
                if ((*partner_sn > bank_genaration) ||
                    (*partner_sn == bank_genaration &&
                     *partner_flag == (my_exchange_node->log_2))) {
                    found = true;
                    break;
                }
                loop_cnt++;
            }
            if (!found) {
                /* set restart parameters */
                sm_desc->collective_phase = NB_POST_PHASE;
                return OMPI_SUCCESS;
            }
        } else {
            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            my_ctl->flag = my_exchange_node->n_exchanges;
        }
    }

    /* set the barrier as complete */
    sm_desc->collective_phase = NB_BARRIER_DONE;

    return ret;
}
/*
 * Init function for the control scheme in which the control struct sits at
 * the top of each payload buffer.
 *
 * The function
 *   1. allocates pload_mgmt->data_buffs, one entry per (buffer, rank);
 *   2. maps the peers' backing files through the shared-memory connection
 *      manager and allgathers every rank's block offset;
 *   3. turns those offsets into local addresses: entry 0 of each rank is
 *      map base + offset, every following buffer is ml_block->size_buffer
 *      bytes further on, and the payload starts data_offset bytes after
 *      the control header;
 *   4. resets this rank's own control headers;
 *   5. records the ml-level block in the buffer-release bookkeeping and
 *      sets up the nonblocking-collective descriptors.
 *
 * The block is assumed to have been registered with the sm bcol and thus to
 * be mapped by every process of the group.
 *
 * payload_block  ml-level memory block descriptor
 * data_offset    size of the header placed in front of each payload
 * bcol_module    the basesmuma module (passed as its base type)
 * reg_data       registration data (bcol_basesmuma_registration_data_t *)
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when an allocation fails,
 * the error code of a failing exchange, or OMPI_ERROR when the
 * nonblocking-collective descriptors cannot be set up.
 */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
                                  uint32_t data_offset,
                                  mca_bcol_base_module_t *bcol_module,
                                  void *reg_data)
{
    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    int leading_dim, loop_limit, buf_id;
    int my_idx, array_id;
    size_t malloc_size;
    unsigned char *base_ptr;
    void **results_array = NULL;   /* NULL so the error path may free it */
    void *mem_offset;
    sm_buffer_mgmt *pload_mgmt;
    bcol_basesmuma_smcm_file_t input_file;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers
     * difference here is that now we use our new data struct
     */
    malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank*
        pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if (NULL == pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }
    /* NOTE(review): on the later error paths data_buffs stays attached to
     * the module, as before - confirm that module teardown releases it */

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
    if (NULL == results_array) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* setup the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' file */
    ret = bcol_basesmuma_smcm_allgather_connection(
        sm_bcol,
        sm_bcol->super.sbgp_partner_module,
        &(cs->sm_connections_list),
        &(sm_bcol->payload_backing_files_info),
        sm_bcol->super.sbgp_partner_module->group_comm,
        input_file, cs->payload_base_fname,
        false);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* now we exchange offset info - don't assume symmetric virtual memory */
    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret = comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
                             sm_bcol->super.sbgp_partner_module->my_index,
                             sm_bcol->super.sbgp_partner_module->group_size,
                             sm_bcol->super.sbgp_partner_module->group_list,
                             sm_bcol->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* convert memory offset to virtual address in current rank */
    leading_dim = pload_mgmt->size_of_group;
    loop_limit = ml_block->num_banks*ml_block->num_buffers_per_bank;
    for (i = 0; i < sm_bcol->super.sbgp_partner_module->group_size; i++) {

        /* get the base pointer of rank i's mapping in my address space */
        array_id = SM_ARRAY_INDEX(leading_dim,0,i);
        if (i == sm_bcol->super.sbgp_partner_module->my_index) {
            /* me */
            base_ptr = cs->sm_payload_structs->map_addr;
        } else {
            base_ptr = sm_bcol->payload_backing_files_info[i]->
                sm_mmap->map_addr;
        }

        /* first, set the pointer to the control struct */
        pload_mgmt->data_buffs[array_id].ctl_struct = (mca_bcol_basesmuma_header_t *)
            (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
        /* second, calculate where to set the data pointer */
        pload_mgmt->data_buffs[array_id].payload = (void *)
            (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                        (uint64_t)(uintptr_t) data_offset);

        for (buf_id = 1; buf_id < loop_limit; buf_id++) {
            int array_id_m1 = SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
            array_id = SM_ARRAY_INDEX(leading_dim,buf_id,i);

            /* same game as above: each buffer follows the previous one at a
             * distance of size_buffer bytes
             *
             * first, set the control struct's position */
            pload_mgmt->data_buffs[array_id].ctl_struct = (mca_bcol_basesmuma_header_t *)
                (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
                             (uint64_t)(uintptr_t)ml_block->size_buffer));

            /* second, set the payload pointer */
            pload_mgmt->data_buffs[array_id].payload = (void *)
                (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                            (uint64_t)(uintptr_t) data_offset);
        }
    }

    /* done with the index array */
    free (results_array);
    results_array = NULL;

    /* initialize my control structures!! */
    my_idx = sm_bcol->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol->super.sbgp_partner_module->group_size;
    for (buf_id = 0; buf_id < loop_limit; buf_id++) {
        array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for (j = 0; j < SM_BCOLS_MAX; j++) {
            for (i = 0; i < NUM_SIGNAL_FLAGS; i++) {
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }

    /* setup the data structures needed for releasing the payload
     * buffers back to the ml level
     */
    for (i = 0; i < (int) ml_block->num_banks; i++) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor =
            ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    if (NULL == ml_mem->bank_release_counter) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol->super.sbgp_partner_module->group_size,
                                               sm_bcol->pow_k)) {

        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    /* release the temporary offset table (free(NULL) is a no-op) */
    free (results_array);
    return ret;
}
/*
 * Progress function of the recursive k-ing barrier.
 *
 * Picks up the state saved for this buffer (iteration, active_requests and
 * the ready flag kept in status) and continues with bounded polling
 * (cm->num_to_probe probes per peer):
 *   - an extra node only waits for its proxy to publish the final flag;
 *   - a proxy whose saved iteration is -1 first waits for its extra rank;
 *   - then the remaining exchange levels are run.
 *
 * Returns BCOL_FN_COMPLETE when the barrier finished, BCOL_FN_STARTED when
 * some peer was not ready yet (state stays saved for the next call).
 */
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
                                             struct coll_ml_function_t *const_args)
{
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    mca_common_netpatterns_k_exchange_node_t *exchange_node =
        &bcol_module->knomial_allgather_tree;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;

    /* state carried from one call to the next */
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;

    int bcol_id = (int) bcol_module->super.bcol_id;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int64_t sequence_number = input_args->sequence_num;
    int leading_dim = bcol_module->colls_no_user_data.size_of_group;
    int buff_idx = buffer_index;
    int idx, i, j, probe, src;
    int pow_k, tree_order;
    int flag_offset;
    int matched;
    int max_requests = 0;   /* must start at zero, bits are added below */
    volatile int8_t ready_flag;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank, *active_requests,*iter,*status);
#endif

    /* row of per-rank descriptors belonging to this buffer */
    idx = SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs = ((volatile mca_bcol_basesmuma_payload_t *)
                  bcol_module->colls_with_user_data.data_buffs) + idx;
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* restore the flag values */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
    ready_flag = *status;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* at each level a rank talks to at most (k - 1) peers; a level is
     * complete when the low (k - 1) bits of active_requests are all set */
    for (i = 0; i < (tree_order - 1); i++) {
        max_requests |= (1<<i);
    }

    if (EXTRA_NODE == exchange_node->node_type) {
        /* an extra rank only waits for the final flag of its proxy */
        ready_flag = flag_offset + 1 + pow_k + 2;
        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        for (i = 0; i < cm->num_to_probe; i++) {
            if (IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number,
                              BARRIER_RKING_FLAG, bcol_id)) {
                goto FINISHED;
            }
        }

        /* not there yet; the saved state is still valid */
        return BCOL_FN_STARTED;
    }

    if ((-1 == *iteration) && (0 < exchange_node->n_extra_sources)) {
        /* I am a proxy and my extra rank had not arrived last time */
        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        matched = 0;
        for (i = 0; i < cm->num_to_probe; i++) {
            if (IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number,
                              BARRIER_RKING_FLAG, bcol_id)) {
                matched = 1;
                break;
            }
        }
        if (!matched) {
            return BCOL_FN_STARTED;
        }

        /* extra rank is in: bump the flag and enter the main phase */
        ready_flag++;
        *iteration = 0;
    }

    /* recursive k-ing phase, resumed at the saved level */
    for ( ; *iteration < pow_k; (*iteration)++) {
        /* I am ready at this level */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;

        if (0 == *active_requests) {
            /* nothing pending from an earlier visit: compute the initial
             * request bits for this level */
            CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);
        }

        for (j = 0; j < (tree_order - 1); j++) {
            src = exchange_node->rank_exchanges[*iteration][j];
            if (src < 0) {
                /* no peer in this slot */
                continue;
            }

            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if (*active_requests & (1<<j)) {
                /* this peer was already seen at this level */
                continue;
            }

            /* probing is the innermost loop for better temporal locality */
            for (probe = 0; probe < cm->num_to_probe; probe++) {
                if (IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number,
                                  BARRIER_RKING_FLAG, bcol_id)) {
                    /* flip the request's bit */
                    *active_requests ^= (1<<j);
                    break;
                }
            }
        }

        if (max_requests != *active_requests) {
            /* somebody is missing: save the published flag and hop out */
            *status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }

        /* level complete: bump the flag, clear the bits for the next level */
        ready_flag++;
        *active_requests = 0;
    }

    /* final flag value, the one an extra rank is waiting for */
    ready_flag = flag_offset + 1 + pow_k + 2;

    if (0 < exchange_node->n_extra_sources) {
        /* simply announce my arrival */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
    }

FINISHED:
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
/*
 * Exchange memory offsets among the ranks of the group (this is the
 * variant that uses the pml allgather).
 *
 * Every rank contributes the pair (my_index, mem_offset).  For each pair
 * received, the offset is stored, converted to a pointer value, in
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index)].  No symmetry of the
 * memory layout between processes is assumed.
 *
 * sm_bcol_module  module whose sbgp partner module describes the group
 * result_array    output; needs a slot for every group index
 * mem_offset      this rank's offset
 * loop_limit      not used by this function
 * leading_dim     leading dimension used to index result_array
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when a scratch buffer
 * cannot be allocated, or the error code of the allgather.
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret = OMPI_SUCCESS, i;
    int count;
    int index_in_group;
    char *send_buff = NULL;
    char *recv_buff = NULL;
    uint64_t rem_mem_offset;

    (void) loop_limit;   /* kept in the signature for existing callers */

    /* one record: group index followed by the 64-bit offset */
    count = sizeof(uint64_t) + sizeof(int);

    /* malloc some memory; the receive size is computed in size_t so the
     * product cannot overflow an int */
    send_buff = (char *) malloc((size_t) count);
    recv_buff = (char *) malloc((size_t) count *
        (size_t) sm_bcol_module->super.sbgp_partner_module->group_size);
    if (NULL == send_buff || NULL == recv_buff) {
        /* do not touch the buffers when either allocation failed */
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* exchange the base pointer for the controls structures - gather
     * every one else's information.
     */

    /* pack my index and the offset of the allocated region */
    memcpy((void *) send_buff,
           (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index),
           sizeof(int));
    memcpy((void *) (send_buff + sizeof(int)),
           (void *) &(mem_offset), sizeof(uint64_t));

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    ret = comm_allgather_pml((void *) send_buff, (void *) recv_buff, count,
                             MPI_BYTE,
                             sm_bcol_module->super.sbgp_partner_module->my_index,
                             sm_bcol_module->super.sbgp_partner_module->group_size,
                             sm_bcol_module->super.sbgp_partner_module->group_list,
                             sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != ret) {
        goto exit_ERROR;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        int array_id;

        /* get this peer's index in the group */
        memcpy((void *) &index_in_group,
               (void *) (recv_buff + i*count), sizeof(int));

        /* get the offset */
        memcpy((void *) &rem_mem_offset,
               (void *) (recv_buff + i*count + sizeof(int)),
               sizeof(uint64_t));

        array_id = SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        result_array[array_id] = (void *)(uintptr_t)rem_mem_offset;
    }

exit_ERROR:
    /* clean up (free(NULL) is a no-op) */
    free(send_buff);
    free(recv_buff);

    return ret;
}
/*
 * Build the table of control-structure addresses for every peer in the
 * group and reset the control structures owned by this rank.
 *
 * The offset of data_blk inside this rank's control region is gathered from
 * all ranks into ctl_mgmt->ctl_buffs.  Each gathered offset is then turned
 * into an address by adding the base of the corresponding peer's mapping,
 * and the entries for the remaining buffers are laid out back to back,
 * sizeof(mca_bcol_basesmuma_ctl_struct_t) apart.
 *
 * Returns OMPI_SUCCESS, or the error code of comm_allgather_pml().
 */
static int base_bcol_basesmuma_exchange_ctl_params(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs,
    sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
{
    int rc;
    int peer, bank;
    int n_ctl_structs, lead;
    void *my_offset;
    unsigned char *peer_base;
    mca_bcol_basesmuma_ctl_struct_t *ctl;

    /* where the data block sits relative to the start of the mapped file */
    my_offset = (void *) ((uintptr_t) data_blk->data -
                          (uintptr_t) cs->sm_ctl_structs->data_addr);

    /* how many control structures each rank has in the data block */
    n_ctl_structs = cs->basesmuma_num_mem_banks + ctl_mgmt->number_of_buffs;
    lead = ctl_mgmt->size_of_group;

    /* every rank deposits its offset in the first row of ctl_buffs */
    rc = comm_allgather_pml(&my_offset, ctl_mgmt->ctl_buffs,
            sizeof(void *), MPI_BYTE,
            sm_bcol_module->super.sbgp_partner_module->my_index,
            sm_bcol_module->super.sbgp_partner_module->group_size,
            sm_bcol_module->super.sbgp_partner_module->group_list,
            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* turn each peer's offset into an address valid in this process */
    for (peer = 0;
         peer < sm_bcol_module->super.sbgp_partner_module->group_size;
         peer++) {
        uintptr_t addr;

        /* my own region comes from the component, a peer's region from
         * the backing file recorded for that peer */
        if (sm_bcol_module->super.sbgp_partner_module->my_index == peer) {
            peer_base = cs->sm_ctl_structs->map_addr;
        } else {
            peer_base =
                sm_bcol_module->ctl_backing_files_info[peer]->sm_mmap->map_addr;
        }

        /* first control structure: gathered offset + mapping base */
        addr = (uintptr_t) ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(lead, 0, peer)]
               + (uintptr_t) peer_base;
        ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(lead, 0, peer)] = (void *) addr;

        /* the remaining ones follow contiguously */
        for (bank = 1; bank < n_ctl_structs; bank++) {
            addr += sizeof(mca_bcol_basesmuma_ctl_struct_t);
            ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(lead, bank, peer)] =
                (void *) addr;
        }
    }

    /* reset my own control structures - RLG, this is only one data
     * structure that needs to be initialized, more are missing */
    for (bank = 0; bank < n_ctl_structs; bank++) {
        int self = sm_bcol_module->super.sbgp_partner_module->my_index;

        ctl = (mca_bcol_basesmuma_ctl_struct_t *)
            ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(lead, bank, self)];

        ctl->sequence_number = -1;
        ctl->flag = -1;
        ctl->index = 0;
        ctl->src_ptr = NULL;
    }

    return rc;
}
/*
 * Exchange control-region offsets across the subgroup using the run-time
 * environment's list allgather.
 *
 * A list naming every process of the group is built, this rank's index and
 * mem_offset are packed into a buffer, and ompi_rte_allgather_list()
 * collects the contributions of all peers.  After one leading value is
 * unpacked and discarded, an (index, offset) pair is unpacked per peer and
 * the offset is stored, as a pointer-sized value, in
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index)].
 *
 * sm_bcol_module - module whose sbgp_partner_module describes the group
 * result_array   - output table, one slot written per peer
 * mem_offset     - offset this rank publishes to its peers
 * loop_limit     - not used by this implementation
 * leading_dim    - leading dimension handed to SM_ARRAY_INDEX
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR when an object cannot be allocated, or
 * the error code of the failing pack / allgather / unpack call.
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret = OMPI_SUCCESS, i, dummy;
    int index_in_group, pcnt;
    opal_list_t peers;
    ompi_namelist_t *peer;
    ompi_proc_t *proc_temp;
    opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
    opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
    uint64_t rem_mem_offset;

    /* exchange the base pointer for the controls structures - gather
     * every one else's information.
     */

    /* constructed before any early exit so the shared cleanup below can
     * always drain and destruct it */
    OBJ_CONSTRUCT(&peers, opal_list_t);

    if (NULL == send_buffer || NULL == recv_buffer) {
        opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n");
        ret = OMPI_ERROR;
        goto cleanup;
    }

    /* get list of procs that will participate in the communication */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        /* get the proc info */
        proc_temp = ompi_comm_peer_lookup(
                sm_bcol_module->super.sbgp_partner_module->group_comm,
                sm_bcol_module->super.sbgp_partner_module->group_list[i]);

        peer = OBJ_NEW(ompi_namelist_t);
        if (NULL == peer) {
            opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for peer list entry\n");
            ret = OMPI_ERROR;
            goto cleanup;
        }
        peer->name.jobid = proc_temp->proc_name.jobid;
        peer->name.vpid = proc_temp->proc_name.vpid;
        /* this is with the new field called "super" in ompi_namelist_t struct */
        opal_list_append(&peers, &peer->super);
    }

    /* pack my information */
    ret = opal_dss.pack(send_buffer,
            &(sm_bcol_module->super.sbgp_partner_module->my_index), 1, OPAL_UINT32);
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n");
        goto cleanup;
    }

    /* pack the offset of the allocated region */
    ret = opal_dss.pack(send_buffer, &(mem_offset), 1, OPAL_UINT64);
    if (OMPI_SUCCESS != ret) {
        goto cleanup;
    }

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
        opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret);
        goto cleanup;
    }

    /* unpack the dummy */
    pcnt = 1;
    ret = opal_dss.unpack(recv_buffer, &dummy, &pcnt, OPAL_INT32);
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret);
        goto cleanup;
    }

    /* get the control structure offsets within the shared memory
     * region and populate the control structures - we do not assume
     * any symmetry in memory layout of each process
     */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        int array_id;

        /* the peer's index in the group */
        pcnt = 1;
        ret = opal_dss.unpack(recv_buffer, &index_in_group, &pcnt, OPAL_UINT32);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret);
            goto cleanup;
        }

        /* get the offset */
        pcnt = 1;
        ret = opal_dss.unpack(recv_buffer, &rem_mem_offset, &pcnt, OPAL_UINT64);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret);
            goto cleanup;
        }

        array_id = SM_ARRAY_INDEX(leading_dim, 0, index_in_group);
        /* go through uintptr_t: a direct uint64_t -> pointer cast is an
         * integer-to-pointer conversion of mismatched width on 32-bit */
        result_array[array_id] = (void *) (uintptr_t) rem_mem_offset;
    }

cleanup:
    /* shared by the success and failure paths: release every list entry,
     * the list itself and both buffers */
    peer = (ompi_namelist_t *) opal_list_remove_first(&peers);
    while (NULL != peer) {
        OBJ_RELEASE(peer);
        peer = (ompi_namelist_t *) opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);

    if (send_buffer) {
        OBJ_RELEASE(send_buffer);
    }
    if (recv_buffer) {
        OBJ_RELEASE(recv_buffer);
    }

    return ret;
}
/*
 * Broadcast over the HDL transport along the module's fan-out tree.
 *
 * A single segment describing input_args->sbuf (count * sizeof(dtype)
 * bytes) is handed to the HDL module:
 *   - the root sends it to each of its children,
 *   - a leaf posts a receive from its parent,
 *   - an interior node receives from its parent and then sends to each of
 *     its own children.
 *
 * input_args   - collective arguments; count, dtype, root, sequence_num,
 *                sbuf and src_desc->buffer_index are read, result_in_rbuf
 *                is set to false
 * c_input_args - supplies the basesmuma module (bcol_module)
 *
 * Returns BCOL_FN_COMPLETE on success, OMPI_ERROR when a descriptor cannot
 * be allocated or an HDL send/receive reports a failure.
 *
 * NOTE(review): hdl_desc and hdl_seg are never freed here.  Whether the HDL
 * module still references them after hdl_recvi() returns cannot be told
 * from this function, so they are deliberately left alone - confirm
 * ownership and release them where it is safe.
 */
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                      coll_ml_function_t *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank, first_instance=0, flag_offset;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int64_t ready_flag;
    int count=input_args->count;
    struct ompi_datatype_t* dtype=input_args->dtype;
    int root=input_args->root;
    int64_t sequence_number=input_args->sequence_num;
    mca_bcol_basesmuma_module_t* bcol_module=
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t* my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    struct mca_hdl_base_descriptor_t *hdl_desc;
    struct mca_hdl_base_segment_t *hdl_seg;
    int ret, completed, ridx/*remote rank index*/;
    bool status;

    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t *my_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *parent_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t *child_ctl_pointer= NULL;
    struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0];

    /* we will work only on packed data - so compute the length*/
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;
    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs+idx;
    my_ctl_pointer = ctl_structs[my_rank];

    /* Align node index to around sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if(0 > my_node_index ) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if(group_size <= my_fanout_parent){
        my_fanout_parent -= group_size;
    }

    /* Allocate the descriptor and its segment before the control structure
     * is modified, so an allocation failure leaves that state untouched. */
    hdl_desc = malloc(sizeof *hdl_desc);
    hdl_seg = malloc(sizeof *hdl_seg);
    if (NULL == hdl_desc || NULL == hdl_seg) {
        /* nothing has been handed to the HDL module yet */
        free(hdl_desc);
        free(hdl_seg);
        return OMPI_ERROR;
    }

    /* setup resource recycling */
    if( my_ctl_pointer->sequence_number < sequence_number ) {
        first_instance = 1;
    }

    if( first_instance ) {
        /* Signal arrival */
        my_ctl_pointer->flag = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values , so only need to
         * set the value for subsequent values that may need this */
        my_ctl_pointer->starting_flag_value = 0;
    } else {
        /* only one thread at a time will be making progress on this
         * collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }

    /* the flag value peers wait for is derived from my starting flag
     * value and the sequence number of this collective */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    /*prepare a hdl data segment*/
    hdl_seg->seg_addr.pval = input_args->sbuf;
    hdl_seg->seg_len = pack_len;
    hdl->endpoint->ready_flag = ready_flag;
    hdl->endpoint->local_ctrl = my_ctl_pointer;
    hdl->endpoint->sbgp_contextid =
        bcol_module->super.sbgp_partner_module->group_comm->c_contextid;

    /*
     * Fan out from root
     */
    if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        hdl_desc->isroot = true;

        /*As the general semantics, there might multiple pairs of send/recv
         *on the topology tree*/
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    }else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        /* a leaf only receives, from its parent's control structure */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;
        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

#if __TEST_BLOCKING__
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
        ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif

#if __TEST_WAIT__
        ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
        BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank));
#endif
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        status = false;
#if __TEST_TEST__
        while (!status) {
            hdl->hdl_test(&hdl_desc, &completed, &status);
            opal_progress();
            BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
        }
#endif
    }else{
        input_args->result_in_rbuf = false;

        /* Interior node: receive from the parent first ... */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;
        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        /* ... then pass the data on to my children */
        MB();
        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;

            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret != OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    }

    /* if I am the last instance of a basesmuma function in this collective,
     * release the resources */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        /* NOTE(review): the result of the release is stored in rc but does
         * not affect the value returned to the caller */
        rc = bcol_basesmuma_free_buff(
                &(bcol_module->colls_with_user_data),
                sequence_number);
    }

    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;

exit_ERROR:
    return OMPI_ERROR;
}
/**
 * Shared memory blocking broadcast - fan-out from the root, for small data
 * buffers.
 *
 * The routine works on the calling rank's payload buffer
 * (input_args->src_desc->data_addr).  The original design notes state that
 * this is a single writer multi reader (SWMR) shared memory buffer owned by
 * the calling rank, registered and fragmented at the ML level and large
 * enough to hold the data.
 *
 * The root only raises its broadcast flag.  Every other rank polls its
 * parent's header until the parent is ready, copies count * sizeof(dtype)
 * bytes from the parent's payload into its own buffer, and - unless it is a
 * leaf - raises its own flag so that its children may read in turn.
 *
 * @param input_args   - collective arguments; count, dtype, root,
 *                       sequence_num, src_desc->data_addr and
 *                       src_desc->buffer_index are read, result_in_rbuf is
 *                       set to false.  root is the index of the root within
 *                       the sbgp.
 * @param c_input_args - supplies the basesmuma module (bcol_module).
 *
 * @return OMPI_SUCCESS
 */
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
                         coll_ml_function_t *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int8_t ready_flag;
    int count=input_args->count;
    struct ompi_datatype_t* dtype=input_args->dtype;
    int root=input_args->root;
    int64_t sequence_number=input_args->sequence_num;
    mca_bcol_basesmuma_module_t* bcol_module=
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;
    int bcol_id = (int) bcol_module->super.bcol_id;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char* parent_data_pointer;
    mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;

    netpatterns_tree_node_t* my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    void *data_addr = (void *)((unsigned char *)input_args->src_desc->data_addr );

    /* we will work only on packed data - so compute the length*/
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len=count*dt_size;

    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;

    /* Align node index to around sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if(0 > my_node_index ) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if(group_size <= my_fanout_parent){
        my_fanout_parent -= group_size;
    }

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* setup resource recycling */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* the result always stays in the source buffer, whatever my role */
    input_args->result_in_rbuf = false;

    /*
     * Fan out from root
     */
    if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
        /* Root should only signal it is ready */
        my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
    }else{
        /* Leaf or interior node: both pull the data from the parent.
         * Get the parent's payload buffer and its control header. */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;

        /* Wait until parent signals that data is ready */
        /* The order of conditions checked in this loop is important, as it can
         * result in a race condition.
         */
        while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number, BCAST_FLAG, bcol_id)){
            opal_progress();
        }

        /* Copy the parent's payload into the buffer owned by this rank */
        memcpy(data_addr, (void *)parent_data_pointer, pack_len);

        if(LEAF_NODE != my_fanout_read_tree->my_node_type) {
            /* Interior node: signal to children that they may read the
             * data from my shared buffer */
            MB();
            my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
        }
    }

    /* advance my starting flag value for this bcol */
    my_ctl_pointer->starting_flag_value[bcol_id]++;

    return rc;
}