/*
 * Nonblocking recursive-doubling barrier on the admin control buffers:
 * first (initiating) call.
 *
 * A single set of control structures is used per memory bank
 * (sm_desc->pool_index); the bank generation counter serves as the
 * sequence number that separates successive barriers on that bank.
 *
 * Each wait polls at most bcol_module->super.n_poll_loops times.  When a
 * partner has not shown up within that budget the current phase is stored
 * in sm_desc (collective_phase, and recursive_dbl_iteration for the
 * exchange phase) and the function returns, so that a later progress call
 * can pick up from there.  On completion collective_phase is set to
 * NB_BARRIER_DONE.
 *
 * Always returns OMPI_SUCCESS; callers must inspect
 * sm_desc->collective_phase to learn whether the barrier finished.
 */
int bcol_basesmuma_rd_nb_barrier_init_admin(
        sm_nbbar_desc_t *sm_desc)
{
    mca_bcol_basesmuma_module_t *bcol_module = sm_desc->sm_module;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *peer_ctl;
    volatile int64_t *peer_sn;
    volatile int *peer_flag;
    int64_t bank_gen;
    int pool_index = sm_desc->pool_index;
    int row, group_size, my_rank, peer_rank, step, spins;
    bool arrived;

    /* Find the row of control-structure pointers that belongs to this bank;
     * the admin rows sit after the first number_of_buffs rows. */
    row = sm_desc->coll_buff->number_of_buffs + pool_index;
    group_size = sm_desc->coll_buff->size_of_group;
    row = SM_ARRAY_INDEX(group_size, row, 0);
    ctl_structs = ((mca_bcol_basesmuma_ctl_struct_t **)
                   sm_desc->coll_buff->ctl_buffs) + row;
    bank_gen = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node = &(bcol_module->recursive_doubling_tree);
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    my_ctl = ctl_structs[my_rank];

    /* Announce arrival: reset the flag first, then publish the sequence
     * number; the write barrier keeps the two stores in that order. */
    my_ctl->flag = -1;
    opal_atomic_wmb ();
    my_ctl->sequence_number = bank_gen;

    /* Pre phase: an exchange node that has an extra rank attached waits
     * for that extra rank to register.  Extra nodes have nothing to do
     * here, their arrival is already published above. */
    if ((0 < my_exchange_node->n_extra_sources) &&
        (EXCHANGE_NODE == my_exchange_node->node_type)) {
        peer_rank = my_exchange_node->rank_extra_source;
        peer_ctl = ctl_structs[peer_rank];
        peer_sn = (volatile int64_t *)&(peer_ctl->sequence_number);

        arrived = false;
        for (spins = 0; spins < bcol_module->super.n_poll_loops; spins++) {
            if (*peer_sn >= bank_gen) {
                arrived = true;
                break;
            }
        }
        if (!arrived) {
            /* remember where to resume */
            sm_desc->collective_phase = NB_PRE_PHASE;
            return OMPI_SUCCESS;
        }
    }

    /* Recursive doubling: at step s the partner is my_rank with bit s
     * flipped. */
    for (step = 0; step < my_exchange_node->n_exchanges; step++) {
        peer_rank = my_rank ^ ( 1 SHIFT_UP step );
        peer_ctl = ctl_structs[peer_rank];
        peer_sn = (volatile int64_t *)&(peer_ctl->sequence_number);
        peer_flag = (volatile int *)&(peer_ctl->flag);

        /* publish the step I have reached */
        my_ctl->flag = step;

        /* The partner counts as arrived when it is already in a later
         * generation, or in this generation at this step or beyond. */
        arrived = false;
        for (spins = 0; spins < bcol_module->super.n_poll_loops; spins++) {
            if( (*peer_sn > bank_gen) ||
                    ( *peer_sn == bank_gen &&
                      *peer_flag >= step ) ) {
                arrived = true;
                break;
            }
        }
        if (!arrived) {
            /* remember phase and step so progress can resume here */
            sm_desc->collective_phase = NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration = step;
            return OMPI_SUCCESS;
        }
    }

    /* Post phase: only relevant when extra ranks exist. */
    if (0 < my_exchange_node->n_extra_sources) {
        if (EXTRA_NODE != my_exchange_node->node_type) {
            /* Tell the extra rank that the recursive doubling phase is
             * finished on my side. */
            my_ctl->flag = my_exchange_node->n_exchanges;
        } else {
            /* I sat out the exchange: wait for my partner's completion
             * signal. */
            peer_rank = my_exchange_node->rank_extra_source;
            peer_ctl = ctl_structs[peer_rank];
            peer_sn = (volatile int64_t *)&(peer_ctl->sequence_number);
            peer_flag = (volatile int *)&(peer_ctl->flag);

            arrived = false;
            for (spins = 0; spins < bcol_module->super.n_poll_loops; spins++) {
                if( (*peer_sn > bank_gen) ||
                        ( (*peer_sn == bank_gen) &&
                          (*peer_flag == (my_exchange_node->log_2)) ) ) {
                    arrived = true;
                    break;
                }
            }
            if (!arrived) {
                /* remember where to resume */
                sm_desc->collective_phase = NB_POST_PHASE;
                return OMPI_SUCCESS;
            }
        }
    }

    /* every phase finished within the polling budget */
    sm_desc->collective_phase = NB_BARRIER_DONE;
    return OMPI_SUCCESS;
}
/*
 * Fan-out step: the root raises its BARRIER_FANOUT_FLAG immediately; every
 * other rank probes its parent's flag up to cm->num_to_probe times and, once
 * the parent is ready, raises its own flag for its children.
 *
 * input_args   - supplies sequence_num and buffer_index
 * c_input_args - supplies the basesmuma bcol module
 *
 * Returns BCOL_FN_COMPLETE when this rank's flag has been raised, or
 * BCOL_FN_STARTED when the parent was not seen ready within the probe
 * budget.
 */
static int bcol_basesmuma_fanout_new(
                bcol_function_args_t *input_args,
                mca_bcol_base_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *bcol_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    netpatterns_tree_node_t *tree = &(bcol_module->fanin_node);

    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int8_t bcol_id = (int8_t) bcol_module->super.bcol_id;
    int buff_index = input_args->buffer_index;
    int64_t sequence_number = input_args->sequence_num;
    int8_t ready_flag;
    int row, attempt;

    volatile mca_bcol_basesmuma_payload_t *ctl_structs;
    volatile mca_bcol_basesmuma_header_t *my_ctl;
    volatile mca_bcol_basesmuma_header_t *parent_ctl;

    /* select the row of per-rank descriptors that belongs to this buffer */
    row = SM_ARRAY_INDEX(group_size, buff_index, 0);
    ctl_structs = ((volatile mca_bcol_basesmuma_payload_t *)
                   bcol_module->colls_with_user_data.data_buffs) + row;
    my_ctl = ctl_structs[my_rank].ctl_struct;

    /* prepare my header for this sequence number and obtain ready_flag */
    BASESMUMA_HEADER_INIT(my_ctl, ready_flag, sequence_number, bcol_id);

    if (!tree->n_parents) {
        /* root of the fan-out: nobody to wait for, release the children */
        my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
        /* advance the starting flag */
        my_ctl->starting_flag_value[bcol_id]++;
        return BCOL_FN_COMPLETE;
    }

    /* non-root: poll the parent a bounded number of times */
    parent_ctl = ctl_structs[tree->parent_rank].ctl_struct;
    attempt = 0;
    while (attempt < cm->num_to_probe) {
        if (IS_PEER_READY(parent_ctl, ready_flag, sequence_number, BARRIER_FANOUT_FLAG, bcol_id)) {
            /* parent is through: pass the signal on to my children */
            my_ctl->flags[BARRIER_FANOUT_FLAG][bcol_id] = ready_flag;
            /* advance the starting flag */
            my_ctl->starting_flag_value[bcol_id]++;
            return BCOL_FN_COMPLETE;
        }
        attempt++;
    }

    /* parent not ready yet */
    return BCOL_FN_STARTED;
}
Example #3
0
/*
 * Recursive k-ing allgather - init (first call) function.
 *
 * Example k=3 n=9
 *
 * Number of exchange steps = log (base k) n
 * Number of steps in exchange step = k (radix)
 *
 * The function polls each peer at most cm->num_to_probe times.  If a peer
 * is not ready within that budget, progress state is stored in
 * bcol_module->ml_mem.nb_coll_desc[input_args->buffer_index]
 * (iteration / status / active_requests) and BCOL_FN_STARTED is returned.
 * When every phase completes, starting_flag_value[bcol_id] is incremented
 * and BCOL_FN_COMPLETE is returned.
 *
 * input_args - supplies sequence_num, buffer_index, src_desc->buffer_index,
 *              count, dtype, sbuf and sbuf_offset
 * const_args - supplies the basesmuma bcol module
 *
 * NOTE(review): the state descriptor is indexed with
 * input_args->buffer_index while the data buffers are indexed with
 * input_args->src_desc->buffer_index - presumably these are the same
 * value; confirm against the caller.
 */
int bcol_basesmuma_k_nomial_allgather_init(bcol_function_args_t *input_args,
                                           struct mca_bcol_base_function_t *const_args)
{
    /* local variables */
    int8_t  flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    int group_size = bcol_module->colls_no_user_data.size_of_group;
    int *list_connected = bcol_module->super.list_n_connected; /* critical for hierarchical colls */
    int bcol_id = (int) bcol_module->super.bcol_id;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    /* per-buffer state that survives between init and later calls */
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int leading_dim, buff_idx, idx;

    int i, j, probe;
    int knt;
    int src;
    int recv_offset, recv_len;

    int pow_k, tree_order;
    int max_requests = 0; /* important to initialize this */

    int matched = 0;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;
    int buff_offset = bcol_module->super.hier_scather_offset;


    /* number of bytes contributed per connected rank */
    int pack_len = input_args->count * input_args->dtype->super.size;

    /* base address of the local buffer that data is copied into */
    void *data_addr = (void*)(
        (unsigned char *) input_args->sbuf +
        (size_t) input_args->sbuf_offset);
    volatile mca_bcol_basesmuma_payload_t *data_buffs;
    volatile char *peer_data_pointer;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;

#if 0
    fprintf(stderr,"entering p2p allgather pack_len %d\n",pack_len);
#endif
    /* locate the row of per-rank payload descriptors for this buffer */
    buff_idx = input_args->src_desc->buffer_index;
    leading_dim = bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
        bcol_module->colls_with_user_data.data_buffs+idx;

    /* Set pointer to current proc ctrl region */
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    /* NTH: copied from progress.
     * Read the starting flag value before the header is (re)initialized
     * below. */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];

    /* initialize headers and ready flag */
    BASESMUMA_HEADER_INIT(my_ctl_pointer, ready_flag, sequence_number, bcol_id);

    /* reset the saved progress state for this buffer */
    *iteration = 0;
    *active_requests = 0;
    *status = 0;

    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;

    /* calculate the maximum number of requests
     * at each level each rank communicates with
     * at most (k - 1) peers
     * so if we set k - 1 bit fields in "max_requests", then
     * we have max_request  == 2^(k - 1) -1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^=  (1<<i);
    }
    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */

    if( EXTRA_NODE == exchange_node->node_type ) {

        /* then I will signal to my proxy rank */
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        /* flag value to wait for on the proxy; it is the same expression
         * that is published at the end of this function for extra ranks */
        ready_flag = flag_offset + 1 + pow_k + 2;
        /* now, poll for completion */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        /* total number of contributions across the whole group; this
         * determines the length of the copy below */
        knt = 0;
        for(i = 0; i < group_size; i++){
            knt += list_connected[i];
        }
        for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* we receive the entire message */
                memcpy((void *)((unsigned char *) data_addr + buff_offset),
                       (void *) ((unsigned char *) peer_data_pointer + buff_offset),
                       knt * pack_len);

                goto FINISHED;
            }

        }

        /* save state and bail; iteration == -1 marks that the extra/proxy
         * handshake is still pending (note that *status is left at 0 on
         * this path) */
        *iteration = -1;
        return BCOL_FN_STARTED;

    }else if ( 0 < exchange_node->n_extra_sources ) {

        /* I am a proxy for someone */
        src = exchange_node->rank_extra_sources_array[0];
        peer_data_pointer = data_buffs[src].payload;
        peer_ctl_pointer = data_buffs[src].ctl_struct;


        /* number of contributions that precede the extra rank; used as
         * the offset of its slice in the buffer */
        knt = 0;
        for(i = 0; i < src; i++){
            knt += list_connected[i];
        }

        /* probe for extra rank's arrival */
        for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                matched = 1;
                /* copy the extra rank's slice into my buffer */
                memcpy((void *)((unsigned char *) data_addr + knt*pack_len),
                       (void *) ((unsigned char *) peer_data_pointer + knt*pack_len),
                       pack_len * list_connected[src]);
                goto MAIN_PHASE;
            }
        }
        /* extra rank not here yet: save the current ready flag and mark
         * the handshake as pending */
        *status = ready_flag;
        *iteration = -1;
        return BCOL_FN_STARTED;


    }

MAIN_PHASE:
    /* bump the ready flag */
    ready_flag++;


    /* we start the recursive k - ing phase */
    for( *iteration = 0; *iteration < pow_k; (*iteration)++) {
        /* announce my arrival; the write barrier is issued before the flag
         * store so earlier writes are ordered ahead of it */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;
        /* calculate the number of active requests.
         * NOTE(review): CALC_ACTIVE_REQUESTS is defined elsewhere;
         * presumably it pre-sets the bits of slots that have no valid
         * peer at this level - confirm. */
        CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iteration],tree_order);
        /* Now post the recv's */
        for( j = 0; j < (tree_order - 1); j++ ) {

            /* recv phase */
            src = exchange_node->rank_exchanges[*iteration][j];

            if( src < 0 ) {
                /* then not a valid rank, continue */

                continue;
            }

            peer_data_pointer = data_buffs[src].payload;
            peer_ctl_pointer = data_buffs[src].ctl_struct;
            if( !(*active_requests&(1<<j))) {
                /* then the bit hasn't been set, thus this peer
                 * hasn't been processed at this level
                 */
                recv_offset = exchange_node->payload_info[*iteration][j].r_offset * pack_len;
                recv_len = exchange_node->payload_info[*iteration][j].r_len * pack_len;
                /* post the receive */
                /* I am putting the probe loop as the inner most loop to achieve
                 * better temporal locality
                 */
                matched = 0;
                for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, ALLGATHER_FLAG, bcol_id)){
                        matched = 1;
                        /* set this request's bit (it was clear, so the
                         * XOR sets it) */
                        *active_requests ^= (1<<j);
                        /* get the data */
                        memcpy((void *)((unsigned char *) data_addr + recv_offset),
                               (void *)((unsigned char *) peer_data_pointer + recv_offset),
                               recv_len);
                    }
                }
            }


        }
        if( max_requests == *active_requests ){
            /* all peers of this level are done: bump the ready flag */
            ready_flag++;
            /* reset the active requests */
            *active_requests = 0;
        } else {
            /* save state and hop out
             * only the iteration needs to be tracked;
             * *iteration already holds the current level and
             * *active_requests the peers completed so far, status records
             * the flag value currently published
             */
            *status = my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }

    /* bump the flag one more time for the extra rank */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, send the data back to the extra  */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        opal_atomic_wmb ();
        my_ctl_pointer->flags[ALLGATHER_FLAG][bcol_id] = ready_flag;

    }

FINISHED:
    /* bump this up */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
/*
 * Admin nonblocking barrier - progress function.
 *
 * Resumes a recursive-doubling barrier that did not complete within its
 * polling budget.  The phase to resume is read from
 * sm_desc->collective_phase (and sm_desc->recursive_dbl_iteration for the
 * exchange phase); nothing is done when the phase is NB_BARRIER_INACTIVE
 * or NB_BARRIER_DONE.
 *
 * Each wait polls at most bcol_module->super.n_poll_loops times.  If the
 * partner is still missing, the restart state is written back to sm_desc
 * and the function returns; otherwise collective_phase ends up as
 * NB_BARRIER_DONE.
 *
 * Always returns OMPI_SUCCESS; callers must inspect
 * sm_desc->collective_phase to learn whether the barrier finished.
 */
int bcol_basesmuma_rd_nb_barrier_progress_admin( 
        sm_nbbar_desc_t *sm_desc)

{
    /* local variables */
    int ret=OMPI_SUCCESS, idx, leading_dim, loop_cnt, exchange;
    /* start_index is only meaningful for the exchange phase; give it a
     * defined value for the paths that jump straight to the post phase */
    int pair_rank, start_index=0, restart_phase;
    mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    netpatterns_pair_exchange_node_t *my_exchange_node;
    int extra_rank, my_rank;
    mca_bcol_basesmuma_ctl_struct_t volatile *partner_ctl;
    mca_bcol_basesmuma_ctl_struct_t volatile *my_ctl;
    int64_t bank_genaration;
    int pool_index=sm_desc->pool_index;
    bool found=false;
    mca_bcol_basesmuma_module_t *bcol_module=sm_desc->sm_module;

    /* get the pointer to the segment of control structures */
    idx = sm_desc->coll_buff->number_of_buffs+pool_index;
    leading_dim = sm_desc->coll_buff->size_of_group;
    idx = SM_ARRAY_INDEX(leading_dim,idx,0);
    ctl_structs = (mca_bcol_basesmuma_ctl_struct_t **)
        sm_desc->coll_buff->ctl_buffs+idx;
    bank_genaration = sm_desc->coll_buff->ctl_buffs_mgmt[pool_index].bank_gen_counter;

    my_exchange_node=&(bcol_module->recursive_doubling_tree);
    my_rank=bcol_module->super.sbgp_partner_module->my_index;
    my_ctl=ctl_structs[my_rank];

    /* check to make sure that this should be progressed */
    if( ( sm_desc->collective_phase == NB_BARRIER_INACTIVE ) ||
        ( sm_desc->collective_phase == NB_BARRIER_DONE ) ) 
    {
        return OMPI_SUCCESS;
    }

    /* set the restart up - and jump to the correct place in the algorithm */
    restart_phase=sm_desc->collective_phase;
    if ( NB_PRE_PHASE == restart_phase ) {
        start_index=0;
    } else if ( NB_RECURSIVE_DOUBLING == restart_phase ) {
        start_index=sm_desc->recursive_dbl_iteration;
        goto Exchange_phase;
    } else {
        goto Post_phase;
    }

    if(0 < my_exchange_node->n_extra_sources) {
        if (EXCHANGE_NODE == my_exchange_node->node_type) {
            volatile int64_t *partner_sn;
            /* I will participate in the exchange - wait for signal from extra
             ** process */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);

            /* spin n iterations until partner registers.
             * found must be cleared before the loop: if n_poll_loops is
             * not positive the body never runs and the test below would
             * otherwise read an indeterminate value. */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops ) 
            {
                if( *partner_sn >= bank_genaration ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_PRE_PHASE;
                return OMPI_SUCCESS;
            }

        }  else {

            /* Nothing to do, already registered that I am here */
        }
    }

Exchange_phase:

    for(exchange = start_index; 
        exchange < my_exchange_node->n_exchanges; exchange++) {

        volatile int64_t *partner_sn;
        volatile int *partner_flag;

        /* rank of exchange partner: my rank with bit "exchange" flipped */
        pair_rank = my_rank ^ ( 1 SHIFT_UP exchange );
        partner_ctl=ctl_structs[pair_rank];
        partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
        partner_flag=(volatile int *)&(partner_ctl->flag);

        /* signal that I am at iteration exchange of the algorithm */
        my_ctl->flag = exchange;

        /* check to see if the partner has arrived */

        /* spin n iterations until partner registers; the partner counts
         * as arrived when it is in a later generation, or in this
         * generation at this iteration or beyond */
        loop_cnt=0;
        found=false;
        while( loop_cnt < bcol_module->super.n_poll_loops ) 
        {
            if( (*partner_sn > bank_genaration) ||
                    ( (*partner_sn == bank_genaration) && 
                      (*partner_flag >= exchange) ) ) {
                found=true;
                break;
            }
            loop_cnt++;
        }
        if( !found ) {
            /* set restart parameters */
            sm_desc->collective_phase=NB_RECURSIVE_DOUBLING;
            sm_desc->recursive_dbl_iteration=exchange;
            return OMPI_SUCCESS;
        }

    }

Post_phase:
    if(0 < my_exchange_node->n_extra_sources)  {
        if ( EXTRA_NODE == my_exchange_node->node_type ) {
            volatile int64_t *partner_sn;
            volatile int *partner_flag;

            /* I will not participate in the exchange - 
             *   wait for signal from extra partner */
            extra_rank = my_exchange_node->rank_extra_source;
            partner_ctl=ctl_structs[extra_rank];
            partner_sn=(volatile int64_t *)&(partner_ctl->sequence_number);
            partner_flag=(volatile int *)&(partner_ctl->flag);

            /* spin n iterations until partner registers */
            loop_cnt=0;
            found=false;
            while( loop_cnt < bcol_module->super.n_poll_loops ) 
            {
                if( (*partner_sn > bank_genaration) ||
                        ( *partner_sn == bank_genaration && 
                        *partner_flag == (my_exchange_node->log_2) ) ) {
                    found=true;
                    break;
                }
                loop_cnt++;
            }
            if( !found ) {
                /* set restart parameters */
                sm_desc->collective_phase=NB_POST_PHASE;
                return OMPI_SUCCESS;
            }

        }  else {

            /* signal the extra rank that I am done with the recursive
             * doubling phase.
             */
            my_ctl->flag = my_exchange_node->n_exchanges;

        }
    }

    /* set the barrier as complete */
    sm_desc->collective_phase=NB_BARRIER_DONE;

    /* return */
    return ret;
}
Example #5
0
/* New init function used for new control scheme where we put the control
 * struct at the top of the payload buffer.
 *
 * Builds, for every (buffer, rank) pair, the local virtual addresses of the
 * peer's control header and payload, initializes this rank's own control
 * headers, and records the ML memory-block parameters in the module.
 *
 * payload_block - ML memory block descriptor (banks, buffers per bank,
 *                 buffer size, base address)
 * data_offset   - offset from a buffer's control header to its payload;
 *                 also stored as the module's total_header_size
 * bcol_module   - the basesmuma module being set up
 * reg_data      - registration data supplying the backing file name/size
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when an allocation fails,
 * the error code of the connection / allgather helpers, or OMPI_ERROR when
 * the nonblocking-collective descriptors cannot be set up.
 *
 * Ownership: pload_mgmt->data_buffs and ml_mem->bank_release_counter are
 * stored in the module and are not released here, not even on failure.
 * NOTE(review): presumably the module teardown frees them - confirm.
 */
int bcol_basesmuma_bank_init_opti(struct mca_bcol_base_memory_block_desc_t *payload_block,
        uint32_t data_offset,
        mca_bcol_base_module_t *bcol_module,
        void *reg_data)
{
    /* assumption here is that the block has been registered with
     * sm bcol hence has been mapped by each process, need to be
     * sure that memory is mapped amongst sm peers
     */

    /* local variables */
    int ret = OMPI_SUCCESS, i, j;
    sm_buffer_mgmt *pload_mgmt;
    mca_bcol_basesmuma_component_t *cs = &mca_bcol_basesmuma_component;
    bcol_basesmuma_registration_data_t *sm_reg_data =
        (bcol_basesmuma_registration_data_t *) reg_data;
    mca_bcol_basesmuma_module_t *sm_bcol =
        (mca_bcol_basesmuma_module_t *) bcol_module;
    mca_bcol_base_memory_block_desc_t *ml_block = payload_block;
    size_t malloc_size;
    bcol_basesmuma_smcm_file_t input_file;
    int leading_dim,loop_limit,buf_id;
    unsigned char *base_ptr;
    mca_bcol_basesmuma_module_t *sm_bcol_module=
        (mca_bcol_basesmuma_module_t *)bcol_module;
    int my_idx, array_id;
    mca_bcol_basesmuma_header_t *ctl_ptr;
    /* NULL so that the common error exit can free it unconditionally */
    void **results_array = NULL, *mem_offset;

    mca_bcol_basesmuma_local_mlmem_desc_t *ml_mem = &sm_bcol_module->ml_mem;

    /* first, we get a pointer to the payload buffer management struct */
    pload_mgmt = &(sm_bcol->colls_with_user_data);

    /* go ahead and get the header size that is cached on the payload block
     */
    sm_bcol->total_header_size = data_offset;

    /* allocate memory for pointers to mine and my peers' payload buffers
     * difference here is that now we use our new data struct
     */
    malloc_size = ml_block->num_banks*ml_block->num_buffers_per_bank*
        pload_mgmt->size_of_group *sizeof(mca_bcol_basesmuma_payload_t);
    pload_mgmt->data_buffs = (mca_bcol_basesmuma_payload_t *) malloc(malloc_size);
    if( !pload_mgmt->data_buffs) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* allocate some memory to hold the offsets */
    results_array = (void **) malloc(pload_mgmt->size_of_group * sizeof (void *));
    if( NULL == results_array ) {
        /* the array is written by the allgather below and read afterwards */
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }

    /* setup the input file for the shared memory connection manager */
    input_file.file_name = sm_reg_data->file_name;
    input_file.size = sm_reg_data->size;
    input_file.size_ctl_structure = 0;
    input_file.data_seg_alignment = BASESMUMA_CACHE_LINE_SIZE;
    input_file.mpool_size = sm_reg_data->size;

    /* call the connection manager and map my shared memory peers' file
     */
    ret = bcol_basesmuma_smcm_allgather_connection(
        sm_bcol,
        sm_bcol->super.sbgp_partner_module,
        &(cs->sm_connections_list),
        &(sm_bcol->payload_backing_files_info),
        sm_bcol->super.sbgp_partner_module->group_comm,
        input_file,cs->payload_base_fname,
        false);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }


    /* now we exchange offset info - don't assume symmetric virtual memory
     */

    mem_offset = (void *) ((uintptr_t) ml_block->block->base_addr -
                           (uintptr_t) cs->sm_payload_structs->data_addr);

    /* call into the exchange offsets function */
    ret=comm_allgather_pml(&mem_offset, results_array, sizeof (void *), MPI_BYTE,
                           sm_bcol_module->super.sbgp_partner_module->my_index,
                           sm_bcol_module->super.sbgp_partner_module->group_size,
                           sm_bcol_module->super.sbgp_partner_module->group_list,
                           sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* convert memory offset to virtual address in current rank.
     * NOTE(review): results_array holds pload_mgmt->size_of_group entries
     * while this loop runs to sbgp_partner_module->group_size - presumably
     * the two are equal; confirm. */
    leading_dim = pload_mgmt->size_of_group;
    loop_limit =  ml_block->num_banks*ml_block->num_buffers_per_bank;
    for (i=0;i< sm_bcol_module->super.sbgp_partner_module->group_size;i++) {

        /* index of rank i's entry for buffer 0 */
        array_id=SM_ARRAY_INDEX(leading_dim,0,i);
        if( i == sm_bcol_module->super.sbgp_partner_module->my_index) {
            /* me */
            base_ptr=cs->sm_payload_structs->map_addr;
        } else {
            base_ptr=sm_bcol_module->payload_backing_files_info[i]->
                sm_mmap->map_addr;
        }

        /* first, set the pointer to the control struct:
         * peer's exchanged offset + local mapping base of that peer */
        pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
            (uintptr_t)(((uint64_t)(uintptr_t)results_array[array_id])+(uint64_t)(uintptr_t)base_ptr);
        /* second, calculate where to set the data pointer */
        pload_mgmt->data_buffs[array_id].payload=(void *)
            (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                        (uint64_t)(uintptr_t) data_offset);

        /* every following buffer sits size_buffer bytes after the
         * previous one */
        for( buf_id = 1 ; buf_id < loop_limit ; buf_id++ ) {
            int array_id_m1=SM_ARRAY_INDEX(leading_dim,(buf_id-1),i);
            array_id=SM_ARRAY_INDEX(leading_dim,buf_id,i);
            /* now, play the same game as above
             *
             * first, set the control struct's position */
            pload_mgmt->data_buffs[array_id].ctl_struct=(mca_bcol_basesmuma_header_t *)
                (uintptr_t)(((uint64_t)(uintptr_t)(pload_mgmt->data_buffs[array_id_m1].ctl_struct) +
                             (uint64_t)(uintptr_t)ml_block->size_buffer));

            /* second, set the payload pointer */
            pload_mgmt->data_buffs[array_id].payload =(void *)
                (uintptr_t)((uint64_t)(uintptr_t) pload_mgmt->data_buffs[array_id].ctl_struct +
                            (uint64_t)(uintptr_t) data_offset);
        }

    }

    /* done with the index array; clear the pointer so the common error
     * exit cannot free it a second time */
    free (results_array);
    results_array = NULL;

    /* initialize my control structures!! */
    my_idx = sm_bcol_module->super.sbgp_partner_module->my_index;
    leading_dim = sm_bcol_module->super.sbgp_partner_module->group_size;
    for( buf_id = 0; buf_id < loop_limit; buf_id++){
        array_id = SM_ARRAY_INDEX(leading_dim,buf_id,my_idx);
        ctl_ptr = pload_mgmt->data_buffs[array_id].ctl_struct;

        /* initialize the data structures */
        for( j = 0; j < SM_BCOLS_MAX; j++){
            for( i = 0; i < NUM_SIGNAL_FLAGS; i++){
                ctl_ptr->flags[i][j] = -1;
            }
        }
        ctl_ptr->sequence_number = -1;
        ctl_ptr->src = -1;
    }




    /* setup the data structures needed for releasing the payload
     * buffers back to the ml level
     */
    for( i=0 ; i < (int) ml_block->num_banks ; i++ ) {
        sm_bcol->colls_with_user_data.
            ctl_buffs_mgmt[i].nb_barrier_desc.ml_memory_block_descriptor=
            ml_block;
    }

    ml_mem->num_banks = ml_block->num_banks;
    ml_mem->bank_release_counter = calloc(ml_block->num_banks, sizeof(uint32_t));
    if( NULL == ml_mem->bank_release_counter ) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit_ERROR;
    }
    ml_mem->num_buffers_per_bank = ml_block->num_buffers_per_bank;
    ml_mem->size_buffer = ml_block->size_buffer;
    /* pointer to ml level descriptor */
    ml_mem->ml_mem_desc = ml_block;

    if (OMPI_SUCCESS != init_nb_coll_buff_desc(&ml_mem->nb_coll_desc,
                                               ml_block->block->base_addr,
                                               ml_mem->num_banks,
                                               ml_mem->num_buffers_per_bank,
                                               ml_mem->size_buffer,
                                               data_offset,
                                               sm_bcol_module->super.sbgp_partner_module->group_size,
                                               sm_bcol_module->pow_k)) {

        BASESMUMA_VERBOSE(10, ("Failed to allocate memory descriptors for storing state of non-blocking collectives\n"));
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;

exit_ERROR:
    /* release the temporary offset array (free(NULL) is a no-op) */
    free (results_array);
    return ret;
}
/*
 * Progress function for the shared-memory recursive k-ing (k-nomial) barrier.
 *
 * The routine is resumable: the level reached, the set of peers already seen
 * at that level and the last published flag value live in
 * bcol_module->ml_mem.nb_coll_desc[buffer_index] (iteration, active_requests,
 * status), so a call that cannot finish returns BCOL_FN_STARTED and a later
 * call picks up from the saved state.
 *
 * input_args - supplies buffer_index and sequence_num.
 * const_args - supplies the basesmuma module (bcol_module).
 *
 * Returns BCOL_FN_COMPLETE once this rank is done with the barrier, or
 * BCOL_FN_STARTED when a peer was not seen within cm->num_to_probe probes.
 */
int bcol_basesmuma_k_nomial_barrier_progress(bcol_function_args_t *input_args,
                        struct coll_ml_function_t *const_args)
{


    /* local variables */
    int flag_offset;
    volatile int8_t ready_flag;
    mca_bcol_basesmuma_module_t *bcol_module = (mca_bcol_basesmuma_module_t *) const_args->bcol_module;
    mca_common_netpatterns_k_exchange_node_t *exchange_node = &bcol_module->knomial_allgather_tree;
    mca_bcol_basesmuma_component_t *cm = &mca_bcol_basesmuma_component;
    uint32_t buffer_index = input_args->buffer_index;
    /* per-buffer saved state: bit j set means peer j of the current level has
     * been accounted for */
    int *active_requests =
        &(bcol_module->ml_mem.nb_coll_desc[buffer_index].active_requests);

    /* per-buffer saved state: current level of the exchange; -1 is tested
     * below as "proxy has not yet seen its extra rank" */
    int *iteration = &bcol_module->ml_mem.nb_coll_desc[buffer_index].iteration;
    /* per-buffer saved state: flag value to resume with (see the early-exit
     * path in the main loop) */
    int *status = &bcol_module->ml_mem.nb_coll_desc[buffer_index].status;
    int *iter = iteration; /* double alias */
    int leading_dim, idx, buff_idx;

    int i, j, probe;
    int src;
    int max_requests = 0; /* critical to set this */
    int pow_k, tree_order;
    int bcol_id = (int) bcol_module->super.bcol_id;
    
    int matched = 0;
    int64_t sequence_number=input_args->sequence_num;
    int my_rank = bcol_module->super.sbgp_partner_module->my_index;

    volatile mca_bcol_basesmuma_payload_t *data_buffs;

    /* control structures */
    volatile mca_bcol_basesmuma_header_t *my_ctl_pointer;
    volatile mca_bcol_basesmuma_header_t *peer_ctl_pointer;
#if 0 
    fprintf(stderr,"%d: entering sm allgather progress active requests %d iter %d ready_flag %d\n",my_rank,
            *active_requests,*iter,*status);
#endif
    /* locate the row of per-rank payload descriptors for this buffer */
    buff_idx = buffer_index; 
    leading_dim=bcol_module->colls_no_user_data.size_of_group; 
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
  
    data_buffs=(volatile mca_bcol_basesmuma_payload_t *)
                bcol_module->colls_with_user_data.data_buffs+idx;
    my_ctl_pointer = data_buffs[my_rank].ctl_struct;
    
    /* read my flag base for this bcol and resume with the flag value that was
     * stored in *status (written below when a call bails out early;
     * NOTE(review): the initial value is presumably set by the matching init
     * routine - confirm) */
    flag_offset = my_ctl_pointer->starting_flag_value[bcol_id];
    ready_flag = *status;
    /* k-nomial parameters */
    tree_order = exchange_node->tree_order;
    pow_k = exchange_node->log_tree_order;
    
    /* calculate the maximum number of requests 
     * at each level each rank communicates with 
     * at most (k - 1) peers 
     * so if we set k - 1 bit fields in "max_requests", then 
     * we have max_request  == 2^(k - 1) -1
     */
    for(i = 0; i < (tree_order - 1); i++){
        max_requests ^= (1<<i);
    }

    /* let's begin the collective, starting with extra ranks and their
     * respective proxies
     */

    if( EXTRA_NODE == exchange_node->node_type ) {

        /* An extra rank only waits for the final flag value of its proxy
         * (the same value the proxy publishes after the main loop below). */
        ready_flag = flag_offset + 1 + pow_k + 2;

        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;

        for( i = 0; i < cm->num_to_probe && (0 == matched); i++ ) {
            if(IS_PEER_READY(peer_ctl_pointer, ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                matched = 1;
                
                goto FINISHED;
            } 

        }

        /* haven't found it, state is cached, bail out */
        return BCOL_FN_STARTED;

    }else if ( ( -1 == *iteration ) && (0 < exchange_node->n_extra_sources) ) {

        /* I am a proxy for someone and have not yet seen that rank arrive */
        src = exchange_node->rank_extra_sources_array[0];
        peer_ctl_pointer = data_buffs[src].ctl_struct;
        
        /* probe for extra rank's arrival */
        for( i = 0; i < cm->num_to_probe && ( 0 == matched); i++) {
            if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                matched = 1;
                /* bump the flag */
                ready_flag++;
                /* mark the proxy step as done so a re-entry skips this branch */
                *iteration = 0;
                goto MAIN_PHASE;
            } 
        }
        /* extra rank not here yet; nothing was modified, retry on next call */
        return BCOL_FN_STARTED;

    }

    /* NOTE(review): if *iteration is still -1 here (no extra sources), the
     * loop below would index rank_exchanges[-1]; presumably the init routine
     * stores 0 in that case - confirm. */
MAIN_PHASE:

    /* start the recursive k - ing phase; since iter aliases iteration, the
     * initializer is a self-assignment and the loop simply resumes at the
     * saved level */
    for( *iter=*iteration; *iter < pow_k; (*iter)++) {
        /* I am ready at this level */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;
        if( 0 == *active_requests ) {
            /* flip some bits, if we don't have active requests from a previous visit
             * (NOTE(review): the macro presumably pre-sets the bits of slots
             * that have no valid peer, since those slots are skipped below yet
             * must be set for the level to complete - confirm) */
            CALC_ACTIVE_REQUESTS(active_requests,exchange_node->rank_exchanges[*iter],tree_order);
        }
        for( j = 0; j < (tree_order - 1); j++ ) {
            
            /* recv phase */
            src = exchange_node->rank_exchanges[*iter][j];
            if( src < 0 ) {
                /* then not a valid rank, continue  
                 */
                continue;
            }

            peer_ctl_pointer = data_buffs[src].ctl_struct;
            /* only probe peers whose bit is still clear */
            if( !(*active_requests&(1<<j))){

                /* I am putting the probe loop as the inner most loop to achieve
                 * better temporal locality 
                 */
                matched = 0;
                for( probe = 0; probe < cm->num_to_probe && (0 == matched); probe++){
                    if(IS_PEER_READY(peer_ctl_pointer,ready_flag, sequence_number, BARRIER_RKING_FLAG, bcol_id)){
                        matched = 1;
                        /* flip the request's bit (it was clear, so this sets it) */
                        *active_requests ^= (1<<j);
                    }
                }
            } 


        }
        if( max_requests == *active_requests ){
            /* every peer of this level has been seen: bump the ready flag */
            ready_flag++;
            /* reset the active requests for the next level */
            *active_requests = 0;
            /* calculate the number of active requests 
             * logically makes sense to do it here. We don't 
             * want to inadvertently flip a bit to zero that we 
             * set previously
             */
        } else {
            /* state is saved hop out: remember the flag value published for
             * this level so the next call resumes with it
             */
            *status = my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id];
            return BCOL_FN_STARTED;
        }
    }
    /* final flag value, the one an extra rank waits for in the EXTRA_NODE
     * branch above */
    ready_flag = flag_offset + 1 + pow_k + 2;

    /* finish off the last piece, release the extra rank */
    if( 0 < exchange_node->n_extra_sources ) {
        /* simply announce my arrival */
        my_ctl_pointer->flags[BARRIER_RKING_FLAG][bcol_id] = ready_flag;

    }

FINISHED:
  
    /* advance my flag base for this bcol before reporting completion */
    my_ctl_pointer->starting_flag_value[bcol_id]++;
    return BCOL_FN_COMPLETE;
}
Example #7
0
/* this is the new one, uses the pml allgather */
/*
 * Exchange one memory offset per rank across the sub-group and store each
 * peer's offset in result_array.
 *
 * Every rank contributes a packed (int my_index, uint64_t mem_offset) record;
 * the records are gathered with comm_allgather_pml() and each received offset
 * is written to result_array[SM_ARRAY_INDEX(leading_dim, 0, index)], where
 * index is the group index carried in the record.
 *
 * sm_bcol_module - module whose sbgp_partner_module describes the group
 *                  (assumed to contain at least the calling rank).
 * result_array   - output array, indexed as described above.
 * mem_offset     - this rank's offset to publish.
 * loop_limit     - not used by this implementation.
 * leading_dim    - leading dimension passed to SM_ARRAY_INDEX.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR if the scratch buffers cannot be
 * allocated, or the error returned by comm_allgather_pml().
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret=OMPI_SUCCESS,i;
    int count;
    int index_in_group;
    int group_size = sm_bcol_module->super.sbgp_partner_module->group_size;
    char *send_buff;
    char *recv_buff;
    uint64_t rem_mem_offset;

    /* one record = group index followed by the offset */
    count = sizeof(uint64_t) + sizeof(int);
    send_buff = malloc((size_t) count);
    /* widen before multiplying so the size is not computed in int */
    recv_buff = malloc((size_t) count * (size_t) group_size);
    if( NULL == send_buff || NULL == recv_buff ) {
        /* the buffers are written to right below, so fail cleanly instead */
        ret = OMPI_ERROR;
        goto exit_ERROR;
    }

    /*  exchange the base pointer for the controls structures - gather
     *  every one else's information.
     */

    /* pack my group index and the offset of the allocated region */
    memcpy((void *) send_buff, (void *) &(sm_bcol_module->super.sbgp_partner_module->my_index), sizeof(int));
    memcpy((void *) (send_buff+ sizeof(int)), (void *) &(mem_offset), sizeof(uint64_t));

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    ret=comm_allgather_pml((void *) send_buff,(void *) recv_buff,count,
            MPI_BYTE,
            sm_bcol_module->super.sbgp_partner_module->my_index,
            sm_bcol_module->super.sbgp_partner_module->group_size,
            sm_bcol_module->super.sbgp_partner_module->group_list,
            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != ret ) {
        goto exit_ERROR;
    }

    /* get the control structure offsets within the shared memory
     *   region and populate the control structures - we do not assume
     *   any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for(i = 0; i < group_size; i++){
        int array_id;
        const char *record = recv_buff + (size_t) i * (size_t) count;

        /* get this peer's index in the group (memcpy: record is unaligned) */
        memcpy((void *) &index_in_group, (const void *) record, sizeof(int));

        /* get the offset */
        memcpy((void *) &rem_mem_offset, (const void *) (record + sizeof(int)), sizeof(uint64_t));

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;

    }

exit_ERROR:
    /* clean up - free(NULL) is a no-op, so no guards are needed */
    free(send_buff);
    free(recv_buff);

    return ret;


}
Example #8
0
/*
 * Publish the offset of this rank's control block, gather the offsets of all
 * peers, and turn them into addresses valid in the calling process.
 *
 * After the allgather, ctl_mgmt->ctl_buffs holds one offset per rank.  Each is
 * rebased onto the local mapping of the owning rank's backing file, and the
 * remaining control-structure slots of that rank are laid out back to back,
 * sizeof(mca_bcol_basesmuma_ctl_struct_t) apart.  Finally the caller's own
 * control structures are reset.
 *
 * Returns OMPI_SUCCESS, or the error code reported by comm_allgather_pml().
 */
static int base_bcol_basesmuma_exchange_ctl_params(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    mca_bcol_basesmuma_component_t *cs,
    sm_buffer_mgmt *ctl_mgmt, list_data_t *data_blk)
{
    /* offset of the data block inside the mapped file */
    void *local_offset = (void *)((uintptr_t)data_blk->data -
                                  (uintptr_t)cs->sm_ctl_structs->data_addr);
    /* total number of control structures per rank */
    const int n_slots = cs->basesmuma_num_mem_banks + ctl_mgmt->number_of_buffs;
    const int lead = ctl_mgmt->size_of_group;
    const int self = sm_bcol_module->super.sbgp_partner_module->my_index;
    const int n_ranks = sm_bcol_module->super.sbgp_partner_module->group_size;
    int rc, rank, slot;

    rc = comm_allgather_pml(&local_offset, ctl_mgmt->ctl_buffs, sizeof(void *),
                            MPI_BYTE, self, n_ranks,
                            sm_bcol_module->super.sbgp_partner_module->group_list,
                            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    /* rebase every rank's offset and lay out its remaining slots */
    for (rank = 0; rank < n_ranks; ++rank) {
        unsigned char *segment_base;
        uint64_t address;
        int cur = SM_ARRAY_INDEX(lead, 0, rank);
        int prev;

        /* pick the local mapping of the segment that belongs to this rank */
        if (rank != self) {
            segment_base = sm_bcol_module->ctl_backing_files_info[rank]->sm_mmap->map_addr;
        } else {
            segment_base = cs->sm_ctl_structs->map_addr;
        }

        /* slot 0: gathered offset + local base address */
        address = (uint64_t)(uintptr_t) ctl_mgmt->ctl_buffs[cur] +
                  (uint64_t)(uintptr_t) segment_base;
        ctl_mgmt->ctl_buffs[cur] = (void *)(uintptr_t) address;

        /* slots 1..n_slots-1: one control structure after the previous one */
        for (slot = 1; slot < n_slots; ++slot) {
            prev = cur;
            cur = SM_ARRAY_INDEX(lead, slot, rank);
            address = (uint64_t)(uintptr_t) (ctl_mgmt->ctl_buffs[prev]) +
                      (uint64_t)(uintptr_t) sizeof(mca_bcol_basesmuma_ctl_struct_t);
            ctl_mgmt->ctl_buffs[cur] = (void *)(uintptr_t) address;
        }
    }

    /* reset my own control structures - RLG: only part of the structure is
     * initialized here, more fields are missing */
    for (slot = 0; slot < n_slots; ++slot) {
        mca_bcol_basesmuma_ctl_struct_t *mine =
            (mca_bcol_basesmuma_ctl_struct_t *)
                ctl_mgmt->ctl_buffs[SM_ARRAY_INDEX(lead, slot, self)];

        mine->sequence_number = -1;
        mine->flag = -1;
        mine->index = 0;
        mine->src_ptr = NULL;
    }

    return rc;
}
Example #9
0
/*
 * Exchange one memory offset per rank across the sub-group, using the RTE
 * list allgather, and store each peer's offset in result_array.
 *
 * Every rank packs (my_index, mem_offset) into an opal buffer; the gathered
 * buffer is unpacked as one leading int32 followed by one (index, offset)
 * pair per group member, and each offset is written to
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index)].
 *
 * sm_bcol_module - module whose sbgp_partner_module describes the group.
 * result_array   - output array, indexed as described above.
 * mem_offset     - this rank's offset to publish.
 * loop_limit     - not used by this implementation.
 * leading_dim    - leading dimension passed to SM_ARRAY_INDEX.
 *
 * Returns OMPI_SUCCESS, OMPI_ERROR on allocation failure, or the error code
 * of the failing pack / allgather / unpack call.  All temporaries are
 * released on every path.
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret=OMPI_SUCCESS,i,dummy;
    int index_in_group, pcnt;
    opal_list_t peers;
    ompi_namelist_t *peer;
    ompi_proc_t *proc_temp;
    opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
    opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
    uint64_t rem_mem_offset;

    /* construct the list first so the shared cleanup below is always valid */
    OBJ_CONSTRUCT(&peers, opal_list_t);

    if (NULL == send_buffer || NULL == recv_buffer) {
        opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n");
        ret = OMPI_ERROR;
        goto cleanup;
    }

    /*  exchange the base pointer for the controls structures - gather
     *  every one else's information.
     */
    /* get list of procs that will participate in the communication */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        /* get the proc info */
        proc_temp = ompi_comm_peer_lookup(
                sm_bcol_module->super.sbgp_partner_module->group_comm,
                sm_bcol_module->super.sbgp_partner_module->group_list[i]);
        peer = OBJ_NEW(ompi_namelist_t);
        if (NULL == peer) {
            /* the entry is written to right below, so fail cleanly instead */
            opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for peer list entry\n");
            ret = OMPI_ERROR;
            goto cleanup;
        }
        peer->name.jobid = proc_temp->proc_name.jobid;
        peer->name.vpid = proc_temp->proc_name.vpid;
        opal_list_append(&peers,&peer->super); /* this is with the new field called "super" in ompi_namelist_t struct */
    }

    /* pack my information */
    ret = opal_dss.pack(send_buffer,
        &(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32);

    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n");
        goto cleanup;
    }

    /* pack the offset of the allocated region */
    ret = opal_dss.pack(send_buffer,&(mem_offset),1,OPAL_UINT64);
    if (OMPI_SUCCESS != ret) {
        goto cleanup;
    }

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
        opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret);
        goto cleanup;
    }

    /* the gathered buffer starts with one int32 that is not needed here */
    pcnt=1;
    ret = opal_dss.unpack(recv_buffer,&dummy, &pcnt, OPAL_INT32);
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret);
        goto cleanup;
    }

    /* get the control structure offsets within the shared memory
     *   region and populate the control structures - we do not assume
     *   any symmetry in memory layout of each process
     */

    /* loop over the procs in the group */
    for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
        int array_id;
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&index_in_group, &pcnt, OPAL_UINT32);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret);
            goto cleanup;
        }

        /* get the offset */
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&rem_mem_offset, &pcnt, OPAL_UINT64);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret);
            goto cleanup;
        }

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        /* go through uintptr_t: a direct uint64_t -> pointer cast is not
         * portable where pointers are narrower than 64 bits */
        result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;

    }

cleanup:
    /* shared by the success and the error paths: drain and destroy the peer
     * list, then drop the two buffers */
    peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    while( NULL !=peer) {
        OBJ_RELEASE(peer);
        peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);
    if( send_buffer ) {
        OBJ_RELEASE(send_buffer);
    }
    if( recv_buffer ) {
        OBJ_RELEASE(recv_buffer);
    }

    return ret;
}
/*
 * Broadcast over the fan-out read tree using the module's first HDL
 * (bcol_module->hdl_module[0]) to move the payload.
 *
 * The tree is rotated so that the root of the broadcast sits at tree
 * position 0.  The root sends input_args->sbuf to each of its children, a
 * leaf receives from its parent, and an interior node receives from its
 * parent and then sends to each of its children.
 *
 * input_args   - supplies count, dtype, root, sequence_num, sbuf and
 *                src_desc->buffer_index; result_in_rbuf is set to false.
 * c_input_args - supplies the basesmuma module (bcol_module).
 *
 * Returns BCOL_FN_COMPLETE on success, OMPI_ERROR if a descriptor cannot be
 * allocated or an HDL send/receive reports an error.
 */
int bcol_basesmuma_hdl_zerocopy_bcast(bcol_function_args_t *input_args,
                                  coll_ml_function_t   *c_input_args)
{
    /* local variables */
    int group_size, process_shift, my_node_index;
    int my_rank, first_instance=0, flag_offset;
    int rc = OMPI_SUCCESS;
    int my_fanout_parent;
    int leading_dim, buff_idx, idx;
    volatile int64_t ready_flag;
    int count=input_args->count;
    struct ompi_datatype_t* dtype=input_args->dtype;
    int root=input_args->root;
    int64_t sequence_number=input_args->sequence_num;
    mca_bcol_basesmuma_module_t* bcol_module=
        (mca_bcol_basesmuma_module_t *)c_input_args->bcol_module;

    netpatterns_tree_node_t* my_fanout_read_tree;
    size_t pack_len = 0, dt_size;

    struct mca_hdl_base_descriptor_t *hdl_desc;
    struct mca_hdl_base_segment_t *hdl_seg;
    int ret, completed, ridx/*remote rank index*/;
    bool status;
    volatile mca_bcol_basesmuma_ctl_struct_t **ctl_structs;
    mca_bcol_basesmuma_ctl_struct_t  *my_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t  *parent_ctl_pointer= NULL;
    volatile mca_bcol_basesmuma_ctl_struct_t  *child_ctl_pointer= NULL;
    struct mca_hdl_base_module_t* hdl = bcol_module->hdl_module[0];


    /* we will work only on packed data - so compute the length*/
    ompi_datatype_type_size(dtype, &dt_size);
    pack_len = count * dt_size;

    /* Allocate the HDL descriptor and its single data segment up front, so
     * an allocation failure is reported before any shared control state is
     * modified.
     * NOTE(review): neither object is freed by this function once handed to
     * the HDL calls below; presumably the HDL layer owns them afterwards -
     * confirm, otherwise they leak on every call. */
    hdl_desc = malloc(sizeof(*hdl_desc));
    if (NULL == hdl_desc) {
        goto exit_ERROR;
    }
    hdl_seg = malloc(sizeof(*hdl_seg));
    if (NULL == hdl_seg) {
        free(hdl_desc);
        goto exit_ERROR;
    }

    /*prepare a hdl data segment*/
    hdl_seg->seg_addr.pval = input_args->sbuf;
    hdl_seg->seg_len = pack_len;

    buff_idx = input_args->src_desc->buffer_index;

    /* Get addressing information */
    my_rank = bcol_module->super.sbgp_partner_module->my_index;
    group_size = bcol_module->colls_no_user_data.size_of_group;
    leading_dim=bcol_module->colls_no_user_data.size_of_group;
    idx=SM_ARRAY_INDEX(leading_dim,buff_idx,0);
    ctl_structs = (volatile mca_bcol_basesmuma_ctl_struct_t **)
        bcol_module->colls_with_user_data.ctl_buffs+idx;
    my_ctl_pointer = ctl_structs[my_rank];

    /* Align node index to around sbgp root */
    process_shift = root;
    my_node_index = my_rank - root;
    if(0 > my_node_index ) {
        my_node_index += group_size;
    }

    /* get my node for the bcast tree */
    my_fanout_read_tree = &(bcol_module->fanout_read_tree[my_node_index]);
    my_fanout_parent = my_fanout_read_tree->parent_rank + process_shift;
    if(group_size <= my_fanout_parent){
        my_fanout_parent -= group_size;
    }

    /* setup resource recycling: the first function to touch this control
     * structure for the current sequence number resets it */
    if( my_ctl_pointer->sequence_number < sequence_number ) {
        first_instance = 1;
    }

    if( first_instance ) {
        /* Signal arrival */
        my_ctl_pointer->flag  = -1;
        my_ctl_pointer->index = 1;
        /* this does not need to use any flag values , so only need to
         * set the value for subsequent values that may need this */
        my_ctl_pointer->starting_flag_value = 0;
        flag_offset = 0;
    } else {
        /* only one thread at a time will be making progress on this
         *   collective, so no need to make this atomic */
        my_ctl_pointer->index++;
    }


    /* derive the flag value for this call from the current flag base */
    flag_offset = my_ctl_pointer->starting_flag_value;
    ready_flag = flag_offset + sequence_number + 1;
    my_ctl_pointer->sequence_number = sequence_number;

    hdl->endpoint->ready_flag = ready_flag;
    hdl->endpoint->local_ctrl  = my_ctl_pointer;
    hdl->endpoint->sbgp_contextid =
        bcol_module->super.sbgp_partner_module->group_comm->c_contextid;

    /*
     * Fan out from root
     */
    if(ROOT_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;

        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        hdl_desc->isroot = true;

        /*As the general semantics, there might multiple pairs of send/recv
         *on the topology tree*/
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;
            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret !=  OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    }else if(LEAF_NODE == my_fanout_read_tree->my_node_type) {
        input_args->result_in_rbuf = false;
        /*
         * Get the parent's control buffer and receive from the parent.
         */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;
        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

#if __TEST_BLOCKING__
        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
#else
        ret = hdl->hdl_recvi(hdl, hdl->endpoint, NULL, 0, 0, &hdl_desc);
#endif

#if __TEST_WAIT__
        ret = hdl->hdl_wait(hdl, hdl->endpoint, hdl_desc);
        BASESMUMA_VERBOSE(1,("wait on rank %d is done!", my_rank));
#endif
        if (OMPI_SUCCESS != ret) {
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        status = false;
#if __TEST_TEST__
        while (!status) {
            hdl->hdl_test(&hdl_desc, &completed, &status);
            opal_progress();
            BASESMUMA_VERBOSE(1, ("test on rank %d ........", my_rank));
        }
#endif

    }else{
        input_args->result_in_rbuf = false;
        /* Interior node */

        /* Get the parent's control buffer and receive from the parent */
        parent_ctl_pointer = ctl_structs[my_fanout_parent];

        hdl_desc->des_dst = hdl_seg;
        hdl_desc->des_dst_cnt = 1;
        hdl_desc->isroot = false;

        hdl->endpoint->remote_ctrl = parent_ctl_pointer;

        ret = hdl->hdl_recv(hdl, hdl->endpoint, hdl_desc);
        if (OMPI_SUCCESS != ret) {
            /* single check: the log call used to sit in a second, unreachable
             * copy of this test */
            BASESMUMA_VERBOSE(1, ("recvi eror on rank %d ........", my_rank));
            goto exit_ERROR;
        }

        /* Make the received data visible before forwarding to the children */
        MB();
        hdl_desc->des_src = hdl_seg;
        hdl_desc->des_src_cnt = 1;
        for (ridx = 0; ridx < my_fanout_read_tree->n_children; ridx++) {
            child_ctl_pointer =
                ctl_structs[my_fanout_read_tree->children_ranks[ridx]];
            hdl->endpoint->remote_ctrl = child_ctl_pointer;

            ret = hdl->hdl_send(hdl, hdl->endpoint, hdl_desc);
            if (ret !=  OMPI_SUCCESS) {
                BASESMUMA_VERBOSE(1, ("send eror on rank %d ........", my_rank));
                goto exit_ERROR;
            }
        }
    }

    /* if I am the last instance of a basesmuma function in this collective,
     *   release the resources.
     * NOTE(review): the result of bcol_basesmuma_free_buff() is stored in rc
     * but has never influenced the return value - confirm that is intended. */
    if (IS_LAST_BCOL_FUNC(c_input_args)) {
        rc = bcol_basesmuma_free_buff(
                &(bcol_module->colls_with_user_data),
                sequence_number);
    }

    /* advance the flag base for the next function using this control struct */
    my_ctl_pointer->starting_flag_value += 1;

    return BCOL_FN_COMPLETE;
exit_ERROR:
    return OMPI_ERROR;
}
/**
 * Shared memory blocking broadcast for small data buffers, implemented as a
 * fan-out over the module's read tree.  The tree is rotated so that the
 * broadcast root occupies tree position 0.
 *
 * - The root only raises its BCAST_FLAG.
 * - Every other rank spins (calling opal_progress()) until its parent's
 *   control header is ready for this sequence number, then copies the packed
 *   payload from the parent's buffer into its own buffer.
 * - An interior rank additionally issues a memory barrier and raises its own
 *   BCAST_FLAG after the copy; a leaf does not.
 *
 * @param input_args - collective arguments: count, dtype, root, sequence_num
 * and src_desc (data_addr is the local destination, buffer_index selects the
 * buffer row).  result_in_rbuf is set to false.
 * @param c_input_args - carries the basesmuma module in bcol_module.
 *
 * @return the value of the local status variable, which is initialized to
 * OMPI_SUCCESS and never reassigned (OMPI_ERROR on the leaf path should it
 * ever be non-zero).
 */
int bcol_basesmuma_bcast(bcol_function_args_t *input_args,
    coll_ml_function_t *c_input_args)
{
    mca_bcol_basesmuma_module_t *sm_module =
        (mca_bcol_basesmuma_module_t *) c_input_args->bcol_module;
    int rc = OMPI_SUCCESS;
    int bcol_id = (int) sm_module->super.bcol_id;
    int bcast_root = input_args->root;
    int n_elems = input_args->count;
    int64_t seq_num = input_args->sequence_num;
    void *local_dst = (void *)((unsigned char *)input_args->src_desc->data_addr );
    int self, n_ranks, row, first, tree_pos, parent;
    int at_root, at_leaf;
    size_t elem_size, n_bytes = 0;
    volatile int8_t ready_flag;
    volatile mca_bcol_basesmuma_payload_t *bank;
    volatile char *parent_payload;
    mca_bcol_basesmuma_header_t *my_hdr;
    volatile mca_bcol_basesmuma_header_t *parent_hdr;
    netpatterns_tree_node_t *tree_node;

    /* only packed data is handled: length in bytes */
    ompi_datatype_type_size(input_args->dtype, &elem_size);
    n_bytes = n_elems * elem_size;

    /* locate the row of per-rank payload descriptors for this buffer */
    row = input_args->src_desc->buffer_index;
    self = sm_module->super.sbgp_partner_module->my_index;
    n_ranks = sm_module->colls_no_user_data.size_of_group;
    first = SM_ARRAY_INDEX(n_ranks, row, 0);
    bank = (volatile mca_bcol_basesmuma_payload_t *)
        sm_module->colls_with_user_data.data_buffs + first;

    /* rotate my rank so the broadcast root maps to tree position 0 */
    tree_pos = self - bcast_root;
    if (tree_pos < 0) {
        tree_pos += n_ranks;
    }

    /* my tree node, and my parent's rank rotated back into group numbering */
    tree_node = &(sm_module->fanout_read_tree[tree_pos]);
    parent = tree_node->parent_rank + bcast_root;
    if (parent >= n_ranks) {
        parent -= n_ranks;
    }

    /* my control header; the macro sets up ready_flag for this call */
    my_hdr = bank[self].ctl_struct;
    BASESMUMA_HEADER_INIT(my_hdr, ready_flag, seq_num, bcol_id);

    /* the result always stays in the source buffer */
    input_args->result_in_rbuf = false;

    at_root = (ROOT_NODE == tree_node->my_node_type);
    at_leaf = (LEAF_NODE == tree_node->my_node_type);

    if (at_root) {
        /* Root should only signal it is ready */
        my_hdr->flags[BCAST_FLAG][bcol_id] = ready_flag;
    } else {
        /* leaf or interior: pull the payload from the parent's buffer */
        parent_payload = bank[parent].payload;
        parent_hdr = bank[parent].ctl_struct;

        /* Wait until the parent signals that its data is ready.  The order
         * of the conditions checked by the macro matters (race otherwise). */
        while (!IS_PEER_READY(parent_hdr, ready_flag, seq_num, BCAST_FLAG, bcol_id)) {
            opal_progress();
        }

        /* copy into the buffer writable by the current rank */
        memcpy(local_dst, (void *)parent_payload, n_bytes);

        if (at_leaf) {
            /* rc is never reassigned; check kept as on the original leaf path */
            if (0 != rc) {
                return OMPI_ERROR;
            }
        } else {
            /* interior: publish only after the copy is complete */
            MB();
            my_hdr->flags[BCAST_FLAG][bcol_id] = ready_flag;
        }
    }

    /* advance my flag base for the next function on this buffer */
    my_hdr->starting_flag_value[bcol_id]++;

    return rc;
}