/*
 * Recursive k-ing allgather over shared memory.
 *
 * Example: k = 3, n = 9
 *
 *   Number of exchange steps            = log (base k) n
 *   Number of steps in an exchange step = k (radix)
 *
 * This is the "init" entry point of a resumable collective.  It resets the
 * per-buffer bookkeeping (iteration, active_requests, status) stored in
 * bcol_module->ml_mem.nb_coll_desc[buffer_index] and then tries to run the
 * whole allgather.  Every wait on a peer is bounded by cm->num_to_probe
 * polls; if a peer does not show up in time the function saves what it can
 * in the bookkeeping fields and returns BCOL_FN_STARTED.
 * NOTE(review): the saved state is presumably picked up by a matching
 * progress routine that is not part of this block - confirm.
 *
 * Three roles are handled, based on the k-nomial exchange node:
 *   - EXTRA_NODE: signals its proxy, then waits for the proxy to publish the
 *     final flag value and copies the complete result from the proxy.
 *   - proxy (n_extra_sources > 0): first pulls the extra rank's contribution,
 *     runs the main exchange, and finally publishes the final flag value.
 *   - everybody else: runs the main exchange only.
 *
 * @param input_args  collective arguments (buffer index, source descriptor,
 *                    send buffer and offset, count, datatype, sequence number)
 * @param const_args  function arguments carrying the basesmuma bcol module
 *
 * @return BCOL_FN_COMPLETE when this rank has finished its part,
 *         BCOL_FN_STARTED when a peer was not ready within num_to_probe polls.
 */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args, struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int8_t flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
    int bcol_id = (int) bcol_module->super.bcol_id;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    /* per-buffer bookkeeping that outlives this call */
    int *active_requests = &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;
    int i, j, probe;
    int knt;
    int src;
    int recv_offset, recv_len;
    int pow_k, tree_order;
    int max_requests = 0; /* important to initialize this: it is built up with XOR below */
    int matched = 0;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int buff_offset = bcol_module->super.hier_scather_offset;
    /* size in bytes of one contribution: element count times datatype size */
    int pack_len = input_args->count * input_args->dtype->super.size;
    /* start of this rank's data: send buffer plus the caller-supplied offset */
    void *data_addr = (void*)( (unsigned char *) input_args->sbuf + (size_t) input_args->sbuf_offset);
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *peer_data_pointer;
    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"entering p2p allgather pack_len %d\n",pack_len);
#endif

    /* locate the row of payload descriptors that belongs to this buffer;
     * the row is indexed by rank below */
    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *) bcol_module->colls_with_user_data.data_buffs+idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* NTH: copied from progress.
     * Note that this value is sampled BEFORE the header is initialized. */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];

    /* initialize headers and ready flag */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* reset the persistent state of this collective */
    *iteration = 0;
    *active_requests = 0;
    *status = 0;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests
     * at each level each rank communicates with
     * at most (k - 1) peers
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_request == 2^(k - 1) -1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^= (1<<i);
    }

    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */
    if( EXTRA_NODE == exchange_node->node_type ) {
        /* then I will signal to my proxy rank */
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        /* from here on wait for the same value the proxy publishes after
         * its last exchange step (see the end of this function) */
        ready_flag = flag_offset + 1 + pow_k + 2;
        /* now, poll for completion */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* total number of contributions in the result: the sum of
         * list_connected over every rank of the group */
        knt = 0;
        for(i = 0; i < group_size; i++){
            knt += list_connected[i];
        }
        for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* we receive the entire message */
                memcpy((void *)((unsigned char *) data_addr + buff_offset), (void *) ((unsigned char *) peer_data_pointer + buff_offset), knt * pack_len);
                goto FINISHED;
            }
        }

        /* save state and bail; iteration == -1 records that we stopped
         * before the main exchange loop */
        *iteration = -1;
        return BCOL_FN_STARTED;
    }else if ( 0 < exchange_node->n_extra_sources ) {
        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* number of contributions that precede the extra rank's own:
         * the sum of list_connected over ranks 0 .. src-1 */
        knt = 0;
        for(i = 0; i < src; i++){
            knt += list_connected[i];
        }
        /* probe for extra rank's arrival */
        for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* copy it in: list_connected[src] contributions, read from
                 * and written to the same byte offset knt*pack_len */
                memcpy((void *)((unsigned char *) data_addr + knt*pack_len), (void *) ((unsigned char *) peer_data_pointer + knt*pack_len), pack_len * list_connected[src]);
                goto MAIN_PHASE;
            }
        }
        /* extra rank not there yet: remember the flag value we were waiting
         * for and record that we stopped before the main exchange loop */
        *status = ready_flag;
        *iteration = -1;
        return BCOL_FN_STARTED;
    }

MAIN_PHASE:
    /* bump the ready flag */
    ready_flag++;

    /* we start the recursive k - ing phase */
    for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
        /* announce my arrival; the write barrier is issued before the flag
         * store so that earlier writes are ordered ahead of it */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;

        /* calculate the number of active requests.
         * NOTE(review): presumably this pre-sets the bits of exchange slots
         * that have no valid peer at this level, which the comparison against
         * max_requests below relies on - confirm against the macro. */
        CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);

        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {
            /* recv phase */
            src = exchange_node->rank_exchanges[*iteration][j];
            if( src < 0 ) {
                /* then not a valid rank, continue */
                continue;
            }

            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if( !(*active_requests&(1<<j))) {
                /* then the bit hasn't been set, thus this peer
                 * hasn't been processed at this level
                 */
                /* payload_info holds offset and length in units of one
                 * contribution; scale both to bytes */
                recv_offset = exchange_node->payload_info[*iteration][j].r_offset * pack_len;
                recv_len = exchange_node->payload_info[*iteration][j].r_len * pack_len;
                /* post the receive */
                /* I am putting the probe loop as the inner most loop to achieve
                 * better temporal locality
                 */
                matched = 0;
                for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                        matched = 1;
                        /* set this request's bit */
                        *active_requests ^= (1<<j);
                        /* get the data: same byte offset in the peer's
                         * payload and in our own buffer */
                        memcpy((void *)((unsigned char *) data_addr + recv_offset), (void *)((unsigned char *) peer_data_pointer + recv_offset), recv_len);
                    }
                }
            }
        }

        if( max_requests == *active_requests ){
            /* every slot of this level is done: bump the ready flag */
            ready_flag++;
            /* reset the active requests */
            *active_requests = 0;
        } else {
            /* save state and hop out
             * only the iteration needs to be tracked
             * (*iteration keeps the current level, *active_requests the
             * peers already handled, *status the flag value we published)
             */
            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank; this is the value an
     * extra rank polls for in the EXTRA_NODE branch above */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
    }

FINISHED:
    /* bump this up: this bcol's starting flag value is advanced once per
     * completed call */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
/*
 * Fan-out step of the shared-memory barrier.
 *
 * The rank looks up its node in the module's fanin_node tree description.
 * A rank without a parent is the root of the fan-out and may signal right
 * away; any other rank polls its parent's BARRIER_FANOUT_FLAG at most
 * cm->num_to_probe times.  Once released, the rank publishes its own ready
 * flag for its children and advances its starting flag value.
 *
 * @param input_args    collective arguments (buffer index, sequence number)
 * @param c_input_args  function arguments carrying the basesmuma bcol module
 *
 * @return BCOL_FN_COMPLETE if this rank signalled,
 *         BCOL_FN_STARTED if the parent was not ready within the probe budget.
 */
static int bcol_basesmuma_fanout_new(bcol_function_args_t *input_args,
                                     mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    netpatterns_tree_node_t *tree_node = &(bcol_module->fanin_node);

    /* identifies which instance of the basesmuma bcol this call belongs to */
    int64_t sequence_number = input_args->sequence_num;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int8_t ready_flag;

    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int group_stride = bcol_module->colls_no_user_data.size_of_group;
    int buffer_slot = input_args->buffer_index;
    int row_start = SM_ARRAY_INDEX(group_stride, buffer_slot, 0);
    int attempt;
    int released;

    /* row of per-rank payload descriptors for this buffer */
    volatile mca_bcol_basesmuma_payload_t *payloads =
        (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + row_start;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl = payloads[my_rank].ctl_struct;
    volatile mca_bcol_basesmuma_header_t *parent_ctl;

    /* init the header */
    BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);

    /* the root of the fanout has no parent to wait for */
    released = (0 == tree_node->n_parents);

    if (!released) {
        /* wait (bounded) on my parent to arrive */
        parent_ctl = payloads[tree_node->parent_rank].ctl_struct;
        for (attempt = 0; attempt < cm->num_to_probe; attempt++) {
            if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number,
                              BARRIER_FANOUT_FLAG, bcol_id)) {
                released = 1;
                break;
            }
        }
    }

    if (!released) {
        /* parent has not shown up yet */
        return BCOL_FN_STARTED;
    }

    /* signal my children */
    my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;

    /* bump the starting flag */
    my_ctl->starting_flag_value[bcol_id]++;

    return BCOL_FN_COMPLETE;
}
/**
 * Shared memory blocking broadcast - fan-out, for small data buffers.
 *
 * The data travels down a fan-out tree that is re-rooted at the broadcast
 * root: ranks are shifted by the root index to find their tree node, and the
 * parent's tree rank is shifted back to obtain its rank in the group.
 *
 *   - The root only publishes its ready flag.
 *   - Every other rank spins (calling opal_progress()) until its parent has
 *     published the flag for this sequence number, then copies pack_len bytes
 *     from the start of the parent's payload into its own buffer.
 *   - Interior ranks additionally issue a memory barrier and publish their
 *     own flag so that their children may read from them; leaves do not
 *     signal.
 *
 * As stated by the original author, this routine assumes that the buffer
 * described by input_args->src_desc is a single-writer multi-reader (SWMR)
 * shared memory buffer owned by the calling rank, that the buffers are
 * registered and fragmented at the ML level, and that the buffer is large
 * enough to hold the data.
 *
 * @param input_args    collective arguments; this routine reads count, dtype,
 *                      root, sequence_num and src_desc (buffer_index and
 *                      data_addr), and sets result_in_rbuf to false.
 * @param c_input_args  function arguments carrying the basesmuma bcol module.
 *
 * @return OMPI_SUCCESS; no error path exists in this routine.
 */
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
                         coll_ml_function_t *c_input_args)
{
    /* local variables */
    int group_size, my_rank, my_node_index, my_fanout_parent;
    int buff_idx, idx;
    volatile int8_t ready_flag;
    int count = input_args->count;
    struct ompi_datatype_t *dtype = input_args->dtype;
    int root = input_args->root;
    int64_t sequence_number = input_args->sequence_num;
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int bcol_id = (int) bcol_module->super.bcol_id;
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *parent_data_pointer;
    /* volatile, like the control pointers in the sibling collectives */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *parent_ctl_pointer;
    netpatterns_tree_node_t *my_fanout_read_tree;
    size_t pack_len = 0, dt_size = 0;
    void *data_addr = (void *) ((unsigned char *) input_args->src_desc->data_addr);

    /* we will work only on packed data - so compute the length */
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    /* Get addressing information */
    buff_idx = input_args->src_desc->buffer_index;
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;

    /* row of per-rank payload descriptors for this buffer */
    idx = SM_ARRAY_INDEX(group_size, buff_idx, 0);
    data_buffs = (volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs + idx;

    /* Align node index around the sbgp root: the root becomes tree node 0 */
    my_node_index = my_rank - root;
    if (0 > my_node_index) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree and map the parent back to a group rank */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + root;
    if (group_size <= my_fanout_parent) {
        my_fanout_parent -= group_size;
    }

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;

    /* setup resource recycling */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* the result always stays in the source buffer, whatever the role */
    input_args->result_in_rbuf = false;

    /*
     * Fan out from root
     */
    if (ROOT_NODE == my_fanout_read_tree->my_node_type) {
        /* Root should only signal it is ready */
        my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
    } else {
        /*
         * Leaf or interior node.
         * Get the base address of the parent's payload buffer and the
         * parent's control buffer.
         */
        parent_data_pointer = data_buffs[my_fanout_parent].payload;
        parent_ctl_pointer = data_buffs[my_fanout_parent].ctl_struct;

        /* Wait until parent signals that data is ready */
        /* The order of conditions checked in this loop is important, as it can
         * result in a race condition.
         */
        while (!IS_PEER_READY(parent_ctl_pointer, ready_flag, sequence_number,
                              BCAST_FLAG, bcol_id)) {
            opal_progress();
        }

        /* Copy the data to a shared buffer writable by the current rank */
        memcpy(data_addr, (void *) parent_data_pointer, pack_len);

        if (LEAF_NODE != my_fanout_read_tree->my_node_type) {
            /* Interior node: signal to children that they may read the data
             * from my shared buffer; the barrier is issued between the copy
             * above and the flag store */
            MB();
            my_ctl_pointer->flags[BCAST_FLAG][bcol_id] = ready_flag;
        }
    }

    /* advance this bcol's starting flag value now that this call is done */
    my_ctl_pointer->starting_flag_value[bcol_id]++;

    return OMPI_SUCCESS;
}