Example #1
0
size_t mca_pml_ob1_rdma_pipeline_btls_count (mca_bml_base_endpoint_t* bml_endpoint)
{
    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    int rdma_count = 0;

    for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; ++i) {
        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (!ignore) {
            ++rdma_count;
        }
    }

    return rdma_count;
}
Example #2
0
static inline mca_bml_base_btl_t *get_next_btl(int dst, int *btl_id)
{
    mca_bml_base_endpoint_t* endpoint;
    mca_bml_base_btl_t* bml_btl;
    oshmem_proc_t *proc;
    mca_bml_base_btl_array_t *btl_array = 0;
    int size = 0;
    int shmem_index = 0;

    /* get endpoint and btl */
    proc = oshmem_proc_group_all(dst);
    if (!proc) {
        SPML_ERROR("Can not find destination proc for pe=%d", dst);
        return NULL ;
    }

    endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    if (!endpoint) {
        SPML_ERROR("pe=%d proc has no endpoint", dst);
        return NULL ;
    }

    /* At the moment always return first transport */
    size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma);

    if (0 >= size) {
        /* Possibly this is SM BTL with KNEM disabled? Then we should use send based get/put */
        /*
           This hack is necessary for the case when KNEM is not available.
           In this case we still want to use send/recv of SM BTL for put and get
           but SM BTL is not in the rdma list anymore
        */
        size = mca_bml_base_btl_array_get_size(btl_array =
                &endpoint->btl_eager);
        if (0 < size) {
            /*Chose SHMEM capable btl from eager array. Not filter now: take the first
              (but could appear on demand).*/
            for (shmem_index = 0; shmem_index < size; shmem_index++) {
                bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
                _find_btl_id(bml_btl);
                size = 1;
                break;
            }
        }
    }

    bml_btl = mca_bml_base_btl_array_get_index(btl_array, shmem_index);
    *btl_id = proc->transport_ids[0];

#if SPML_YODA_DEBUG == 1
    assert(*btl_id >= 0 && *btl_id < YODA_BTL_MAX);
    SPML_VERBOSE(100, "pe=%d reachable via btl %s %d", dst,
                 bml_btl->btl->btl_component->btl_version.mca_component_name, *btl_id);
#endif
    return bml_btl;
}
Example #3
0
int mca_pml_ob1_progress(void)
{
    int i, queue_length = opal_list_get_size(&mca_pml_ob1.send_pending);
    int j, completed_requests = 0;
    bool send_succedded;

#if OPAL_CUDA_SUPPORT
    mca_pml_ob1_process_pending_cuda_async_copies();
#endif /* OPAL_CUDA_SUPPORT */

    if( OPAL_LIKELY(0 == queue_length) )
        return 0;

    for( i = 0; i < queue_length; i++ ) {
        mca_pml_ob1_send_pending_t pending_type = MCA_PML_OB1_SEND_PENDING_NONE;
        mca_pml_ob1_send_request_t* sendreq;
        mca_bml_base_endpoint_t* endpoint;

        sendreq = get_request_from_send_pending(&pending_type);
        if(OPAL_UNLIKELY(NULL == sendreq))
            break;

        switch(pending_type) {
        case MCA_PML_OB1_SEND_PENDING_NONE:
            assert(0);
            return 0;
        case MCA_PML_OB1_SEND_PENDING_SCHEDULE:
            if( mca_pml_ob1_send_request_schedule_exclusive(sendreq) ==
                OMPI_ERR_OUT_OF_RESOURCE ) {
                return 0;
            }
            completed_requests++;
            break;
        case MCA_PML_OB1_SEND_PENDING_START:
            MCA_PML_OB1_SEND_REQUEST_RESET(sendreq);
            endpoint = sendreq->req_endpoint;
            send_succedded = false;
            for(j = 0; j < (int)mca_bml_base_btl_array_get_size(&endpoint->btl_eager); j++) {
                mca_bml_base_btl_t* bml_btl;
                int rc;

                /* select a btl */
                bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
                rc = mca_pml_ob1_send_request_start_btl(sendreq, bml_btl);
                if( OPAL_LIKELY(OMPI_SUCCESS == rc) ) {
                    send_succedded = true;
                    completed_requests++;
                    break;
                }
            }
            if( false == send_succedded ) {
                add_request_to_send_pending(sendreq, MCA_PML_OB1_SEND_PENDING_START, true);
            }
        }
    }
    return completed_requests;
}
Example #4
0
size_t mca_pml_bfo_rdma_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_bfo_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;
    int num_btls_used = 0, n;

    /* shortcut when there are no rdma capable btls */
    if(num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for(n = 0; n < num_btls && num_btls_used < mca_pml_bfo.max_rdma_per_request;
            n++) {
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                    (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_mpool_base_registration_t* reg = &pml_bfo_dummy_reg;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl->btl_mpool;

        if( NULL != btl_mpool ) {
            if(!mca_pml_bfo.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            if(NULL == reg)
                continue;
        }

        rdma_btls[num_btls_used].bml_btl = bml_btl;
        rdma_btls[num_btls_used].btl_reg = reg;
        weight_total += bml_btl->btl_weight;
        num_btls_used++;
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if(0 == num_btls_used || (!mca_pml_bfo.leave_pinned && weight_total < 0.5))
        return 0;

    mca_pml_bfo_calc_weighted_length(rdma_btls, num_btls_used, size,
                                     weight_total);

    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
Example #5
0
static int mca_bml_r2_del_procs(size_t nprocs, 
                                struct ompi_proc_t** procs) 
{
    size_t p;
    int rc;
    struct ompi_proc_t** del_procs = (struct ompi_proc_t**) 
        malloc(nprocs * sizeof(struct ompi_proc_t*)); 
    size_t n_del_procs = 0; 

    if (NULL == del_procs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for(p = 0; p < nprocs; p++) { 
        ompi_proc_t *proc = procs[p]; 
        if(((opal_object_t*)proc)->obj_reference_count == 1) { 
            del_procs[n_del_procs++] = proc; 
        }
    }

    for(p = 0; p < n_del_procs; p++) {
        ompi_proc_t *proc = del_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint =
            (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        size_t f_index, f_size;

        /* notify each btl that the proc is going away */
        f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
        for(f_index = 0; f_index < f_size; f_index++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, f_index);
            mca_btl_base_module_t* btl = bml_btl->btl;

            rc = btl->btl_del_procs(btl, 1, &proc, &bml_btl->btl_endpoint);
            if(OMPI_SUCCESS != rc) {
                free(del_procs);
                return rc;
            }

            /* The reference stored in btl_eager and btl_rdma will automatically
             * dissapear once the btl_array destructor is called. Thus, there is
             * no need for extra cleaning here.
             */
        }

        OBJ_RELEASE(proc);
        /* do any required cleanup */
        OBJ_RELEASE(bml_endpoint);
        proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
    }
    free(del_procs);

    return OMPI_SUCCESS;
}
Example #6
0
size_t mca_pml_ob1_rdma_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_ob1_rdma_btl_t* rdma_btls)
{
    size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    size_t num_btls_used = 0;
    size_t n;

    /* shortcut when there are no rdma capable btls */
    if(num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */        
    for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST;
            n++) {
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                    (bml_endpoint->btl_rdma_index + n) % num_btls); 
        mca_mpool_base_registration_t* reg = NULL;
        mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;

        /* btl is rdma capable and registration is not required */
        if(NULL == btl_mpool) {
            reg = NULL;
        } else {
            if(!mca_pml_ob1.leave_pinned) {
                /* look through existing registrations */
                btl_mpool->mpool_find(btl_mpool, base, size, &reg);
            } else {
                /* register the memory */
                btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
            }

            if(NULL == reg)
                bml_btl = NULL; /* skip it */
        }

        if(bml_btl != NULL) {
            rdma_btls[num_btls_used].bml_btl = bml_btl;
            rdma_btls[num_btls_used].btl_reg = reg;
            num_btls_used++;
        }
    }
    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
Example #7
0
size_t mca_pml_ob1_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
                                       size_t size,
                                       mca_pml_ob1_com_btl_t* rdma_btls )
{
    int num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    double weight_total = 0;
    int rdma_count = 0;

    for(int i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
        mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (ignore) {
            continue;
        }

        rdma_btls[rdma_count].bml_btl = bml_btl;
        rdma_btls[rdma_count++].btl_reg = NULL;

        weight_total += bml_btl->btl_weight;
    }

    mca_pml_ob1_calc_weighted_length (rdma_btls, rdma_count, size, weight_total);

    return rdma_count;
}
Example #8
0
size_t mca_pml_bfo_rdma_pipeline_btls( mca_bml_base_endpoint_t* bml_endpoint,
                                       size_t size,
                                       mca_pml_bfo_com_btl_t* rdma_btls )
{
    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    double weight_total = 0;

    for(i = 0; i < num_btls && i < mca_pml_bfo.max_rdma_per_request; i++) {
        rdma_btls[i].bml_btl =
            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
        if(NULL != rdma_btls[i].bml_btl->btl->btl_mpool)
            rdma_btls[i].btl_reg = NULL;
        else
            rdma_btls[i].btl_reg = &pml_bfo_dummy_reg;

        weight_total += rdma_btls[i].bml_btl->btl_weight;
    }

    mca_pml_bfo_calc_weighted_length(rdma_btls, i, size, weight_total);

    return i;
}
Example #9
0
/* for each proc create transport ids which are indexes into global
 * btl list&map
 */
static int create_btl_idx(int dst_pe)
{
    oshmem_proc_t *proc;
    int btl_id;
    mca_bml_base_endpoint_t* endpoint;
    mca_bml_base_btl_t* bml_btl = 0;
    int i, size;
    mca_bml_base_btl_array_t *btl_array;
    int shmem_index = -1;

    proc = oshmem_proc_group_find(oshmem_group_all, dst_pe);
    endpoint = (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    assert(endpoint);
    size = mca_bml_base_btl_array_get_size(btl_array = &endpoint->btl_rdma);

    if (0 >= size) {
        /* Possibly this is SM BTL with KNEM disabled? Then we should use send based get/put */
        /*
           This hack is necessary for the case when KNEM is not available.
           In this case we still want to use send/recv of SM BTL for put and get
           but SM BTL is not in the rdma list anymore
        */
        size = mca_bml_base_btl_array_get_size(btl_array =
                &endpoint->btl_eager);
        if (0 < size) {
            /*Chose SHMEM capable btl from eager array. Not filter now: take the first
              (but could appear on demand).*/
            shmem_index = 0;
            size = 1;
        }
        else {
            SPML_ERROR("no SHMEM capable transport for dest pe=%d", dst_pe);
            return OSHMEM_ERROR;
        }
    }

    proc->transport_ids = (char *) malloc(size * sizeof(char));
    if (!proc->transport_ids)
        return OSHMEM_ERROR;

    proc->num_transports = size;

    for (i = 0; i < size; i++) {
        bml_btl = mca_bml_base_btl_array_get_index(btl_array,
                                                   (shmem_index >= 0) ?
                                                       (shmem_index) : (i));
        btl_id = _find_btl_id(bml_btl);
        SPML_VERBOSE(50,
                     "dst_pe(%d) use btl (%s) btl_id=%d",
                     dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id);
        if (0 > btl_id) {
            SPML_ERROR("unknown btl: dst_pe(%d) use btl (%s) btl_id=%d",
                       dst_pe, bml_btl->btl->btl_component->btl_version.mca_component_name, btl_id);
            return OSHMEM_ERROR;
        }
        proc->transport_ids[i] = btl_id;
        mca_spml_yoda.btl_type_map[btl_id].bml_btl = bml_btl;
        mca_spml_yoda.btl_type_map[btl_id].use_cnt++;
    }
    return OSHMEM_SUCCESS;
}
Example #10
0
static int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
{
    mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
    mca_bml_base_btl_t* bml_btl;
    mca_btl_base_module_t* ep_btl;
    double total_bandwidth = 0;
    size_t b;

    if(NULL == ep)
        return OMPI_SUCCESS;

    /* remove btl from eager list */
    mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
    
    /* remove btl from send list */ 
    if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) { 
    
        /* compute total_bandwidth and 
           reset max_send_size to the min of all btl's */
        total_bandwidth = 0;
        ep->btl_max_send_size = -1;
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            ep_btl = bml_btl->btl;

            total_bandwidth += ep_btl->btl_bandwidth;
            if (ep->btl_max_send_size > ep_btl->btl_max_send_size) {
                ep->btl_max_send_size = ep_btl->btl_max_send_size;
            }
        }
        
        /* compute weighting factor for this btl */
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            ep_btl = bml_btl->btl;

            if(ep_btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send));
            }
        }
    }

    /* remove btl from RDMA list */
    if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) { 
        
        /* compute total bandwidth */
        total_bandwidth = 0;
        ep->btl_pipeline_send_length = 0;
        ep->btl_send_limit = 0;
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            ep_btl = bml_btl->btl;

            /* update aggregate endpoint info */
            total_bandwidth += ep_btl->btl_bandwidth;
            if (ep->btl_pipeline_send_length < ep_btl->btl_rdma_pipeline_send_length) {
                ep->btl_pipeline_send_length = ep_btl->btl_rdma_pipeline_send_length;
            }
            if (ep->btl_send_limit < ep_btl->btl_min_rdma_pipeline_size) {
                ep->btl_send_limit = ep_btl->btl_min_rdma_pipeline_size;
            }
        }
        
        /* compute weighting factor for this btl */
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            ep_btl = bml_btl->btl;

            if(ep_btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(ep_btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma));
            }
        }
    }
    
    return OMPI_SUCCESS;
}
Example #11
0
static int mca_bml_r2_del_procs(size_t nprocs, 
                                struct ompi_proc_t** procs) 
{
    size_t p;
    int rc;
    struct ompi_proc_t** del_procs = (struct ompi_proc_t**) 
        malloc(nprocs * sizeof(struct ompi_proc_t*)); 
    size_t n_del_procs = 0; 

    if (NULL == del_procs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for(p =0; p < nprocs; p++) { 
        ompi_proc_t *proc = procs[p]; 
        if(((opal_object_t*)proc)->obj_reference_count == 1) { 
            del_procs[n_del_procs++] = proc; 
        }
    }
    
    for(p = 0; p < n_del_procs; p++) {
        ompi_proc_t *proc = del_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint =
            (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        size_t f_index, f_size;
        size_t n_index, n_size;
 
        /* notify each btl that the proc is going away */
        f_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_eager);
        for(f_index = 0; f_index < f_size; f_index++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_eager, f_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            
            rc = btl->btl_del_procs(btl,1,&proc,&bml_btl->btl_endpoint);
            if(OMPI_SUCCESS != rc) {
                return rc;
            }

            /* remove this from next array so that we dont call it twice w/ 
             * the same address pointer
             */
            n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_eager);
            for(n_index = 0; n_index < n_size; n_index++) {
                mca_bml_base_btl_t* search_bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
                if(search_bml_btl->btl == btl) {
                    memset(search_bml_btl, 0, sizeof(mca_bml_base_btl_t));
                    break;
                }
            }
        }

        /* notify each r2 that was not in the array of r2s for first fragments */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_eager, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            if (btl != 0) {
                rc = btl->btl_del_procs(btl,1,&proc,&bml_btl->btl_endpoint);
                if(OMPI_SUCCESS != rc) {
                    return rc;
                }
            }
        }
        
        OBJ_RELEASE(proc); 
        /* do any required cleanup */
        OBJ_RELEASE(bml_endpoint);
        
    }
    return OMPI_SUCCESS;
}
Example #12
0
static int mca_bml_r2_add_procs( size_t nprocs, 
                                 struct ompi_proc_t** procs, 
                                 struct opal_bitmap_t* reachable )
{
    size_t p, p_index, n_new_procs = 0;
    struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;  
    struct ompi_proc_t** new_procs = NULL; 
    struct ompi_proc_t *unreach_proc = NULL;
    int rc, ret = OMPI_SUCCESS;

    if(0 == nprocs) {
        return OMPI_SUCCESS;
    }
    
    if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
        return rc;
    }
    
    /* Select only the procs that don't yet have the BML proc struct. This prevent
     * us from calling btl->add_procs several this on the same destination proc.
     */
    for(p_index = 0; p_index < nprocs; p_index++) { 
        struct ompi_proc_t* proc = procs[p_index]; 

        OBJ_RETAIN(proc); 
        if(NULL !=  proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) { 
            continue;  /* go to the next proc */
        }
        /* Allocate the new_procs on demand */
        if( NULL == new_procs ) {
            new_procs = (struct ompi_proc_t **)malloc(nprocs * sizeof(struct ompi_proc_t *));
            if( NULL == new_procs ) {
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
        }
        new_procs[n_new_procs++] = proc; 
    }

    if ( 0 == n_new_procs ) {
        return OMPI_SUCCESS;
    }

    /* Starting from here we only work on the unregistered procs */
    procs = new_procs; 
    nprocs = n_new_procs; 
    
    /* attempt to add all procs to each r2 */
    btl_endpoints = (struct mca_btl_base_endpoint_t **) 
        malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); 
    if (NULL == btl_endpoints) {
        free(new_procs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
        mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
        int btl_inuse = 0;

        /* if the r2 can reach the destination proc it sets the
         * corresponding bit (proc index) in the reachable bitmap
         * and can return addressing information for each proc
         * that is passed back to the r2 on data transfer calls
         */
        opal_bitmap_clear_all_bits(reachable);
        memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); 

        rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable);
        if(OMPI_SUCCESS != rc) {
            /* This BTL has troubles adding the nodes. Let's continue maybe some other BTL
             * can take care of this task.
             */
            continue;
        }

        /* for each proc that is reachable */
        for( p = 0; p < n_new_procs; p++ ) {
            if(opal_bitmap_is_set_bit(reachable, p)) {
                ompi_proc_t *proc = new_procs[p]; 
                mca_bml_base_endpoint_t * bml_endpoint = 
                    (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]; 
                mca_bml_base_btl_t* bml_btl; 
                size_t size;
                
                if(NULL == bml_endpoint) { 
                    /* allocate bml specific proc data */
                    bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
                    if (NULL == bml_endpoint) {
                        opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
                        free(btl_endpoints);
                        free(new_procs);
                        return OMPI_ERR_OUT_OF_RESOURCE;
                    }
                    
                    /* preallocate space in array for max number of r2s */
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
                    bml_endpoint->btl_max_send_size = -1;
                    bml_endpoint->btl_proc = proc;
                    proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = bml_endpoint; 
                 
                    bml_endpoint->btl_flags_or = 0;
                }

                /* dont allow an additional BTL with a lower exclusivity ranking */
                size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
                if(size > 0) {
                    bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
                    /* skip this btl if the exclusivity is less than the previous */
                    if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
                        btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]);
                        continue;
                    }
                }

                /* cache the endpoint on the proc */
                bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
                bml_btl->btl = btl;
                bml_btl->btl_endpoint = btl_endpoints[p];
                bml_btl->btl_weight = 0;
                bml_btl->btl_flags = btl->btl_flags; 
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == btl->btl_put) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
                                " the %s BTL without any PUT function attached. Disard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
                }
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == btl->btl_get) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
                                " the %s BTL without any GET function attached. Discard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
                }
                if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
                    /**
                     * If no protocol specified, we have 2 choices: we ignore the BTL
                     * as we don't know which protocl to use, or we suppose that all
                     * BTLs support the send protocol. 
                     */
                    bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
                }
                /**
                 * calculate the bitwise OR of the btl flags 
                 */
                bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
                /* This BTL is in use, allow the progress registration */
                btl_inuse++;
            }
        }
        if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
            size_t p;
            bool found = false;
            for( p = 0; p < mca_bml_r2.num_btl_progress; p++ ) {
                if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
                    found = true;
                    break;
                }
            }
            if(found == false) {
                mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = 
                    btl->btl_component->btl_progress;
                mca_bml_r2.num_btl_progress++;
                opal_progress_register( btl->btl_component->btl_progress );
            }
        }
    }
    free(btl_endpoints);

    /* iterate back through procs and compute metrics for registered r2s */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint = 
            (mca_bml_base_endpoint_t*) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        double total_bandwidth = 0;
        uint32_t latency = 0xffffffff;
        size_t n_index;
        size_t n_size;

        /* skip over procs w/ no btl's registered */
        if(NULL == bml_endpoint) {
            continue;
        }

        /* (1) determine the total bandwidth available across all btls
         *     note that we need to do this here, as we may already have btls configured
         * (2) determine the highest priority ranking for latency
         * (3) compute the maximum amount of bytes that can be send without any
         *     weighting. Once the left over is smaller than this number we will
         *     start using the weight to compute the correct amount.
         */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); 
        
        /* sort BTLs in descending order according to bandwidth value */
        qsort(bml_endpoint->btl_send.bml_btls, n_size,
                sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);

        bml_endpoint->btl_rdma_index = 0;
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if(btl->btl_latency < latency) {
                latency = btl->btl_latency;
            }
        }
        
        /* (1) set the weight of each btl as a percentage of overall bandwidth
         * (2) copy all btl instances at the highest priority ranking into the
         *     list of btls used for first fragments
         */
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t *btl = bml_btl->btl;

            /* compute weighting factor for this r2 */
            if(btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = (float)(btl->btl_bandwidth / total_bandwidth);
            } else {
                bml_btl->btl_weight = (float)(1.0 / n_size);
            }

            /* check to see if this r2 is already in the array of r2s 
             * used for first fragments - if not add it.
             */
            if(btl->btl_latency == latency) {
                mca_bml_base_btl_t* bml_btl_new = 
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
                *bml_btl_new = *bml_btl;
            }

            /* set endpoint max send size as min of available btls */
            if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
               bml_endpoint->btl_max_send_size = btl->btl_max_send_size;

            /* check flags - is rdma prefered */
            if ((btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET)) &&
                !((proc->proc_arch != ompi_proc_local_proc->proc_arch) &&
                  (0 == (btl->btl_flags & MCA_BTL_FLAGS_HETEROGENEOUS_RDMA)))) {
                mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
                mca_btl_base_module_t* btl_rdma = bml_btl->btl;

                *bml_btl_rdma = *bml_btl;
                if(bml_endpoint->btl_pipeline_send_length < btl_rdma->btl_rdma_pipeline_send_length) {
                    bml_endpoint->btl_pipeline_send_length = btl_rdma->btl_rdma_pipeline_send_length;
                }
                if(bml_endpoint->btl_send_limit < btl_rdma->btl_min_rdma_pipeline_size) {
                    bml_endpoint->btl_send_limit = btl_rdma->btl_min_rdma_pipeline_size;
                }
            }
        }
    }

    /* see if we have a connection to everyone else */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];

        if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
            if (NULL == unreach_proc) {
                unreach_proc = proc;
            }
            ret = OMPI_ERR_UNREACH;
        }
    }

    if (mca_bml_r2.show_unreach_errors && 
        OMPI_ERR_UNREACH == ret) {
        opal_show_help("help-mca-bml-r2.txt",
                       "unreachable proc",
                       true, 
                       OMPI_NAME_PRINT(&(ompi_proc_local_proc->proc_name)),
                       (NULL != ompi_proc_local_proc->proc_hostname ?
                        ompi_proc_local_proc->proc_hostname : "unknown!"),
                       OMPI_NAME_PRINT(&(unreach_proc->proc_name)),
                       (NULL != ompi_proc_local_proc->proc_hostname ?
                        ompi_proc_local_proc->proc_hostname : "unknown!"),
                       btl_names);
    }

    free(new_procs); 

    return ret;
}
Example #13
0
size_t mca_pml_ob1_rdma_btls(
    mca_bml_base_endpoint_t* bml_endpoint,
    unsigned char* base,
    size_t size,
    mca_pml_ob1_com_btl_t* rdma_btls)
{
    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
    int num_eager_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager);
    double weight_total = 0;
    int num_btls_used = 0;

    /* shortcut when there are no rdma capable btls */
    if(num_btls == 0) {
        return 0;
    }

    /* check to see if memory is registered */
    for (int n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; n++) {
        mca_bml_base_btl_t* bml_btl =
            mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma,
                    (bml_endpoint->btl_rdma_index + n) % num_btls);
        mca_btl_base_registration_handle_t *reg_handle = NULL;
        mca_btl_base_module_t *btl = bml_btl->btl;
        /* NTH: go ahead and use an rdma btl if is the only one */
        bool ignore = !mca_pml_ob1.use_all_rdma;

        /* do not use rdma btls that are not in the eager list. this is necessary to avoid using
         * btls that exist on the endpoint only to support RMA. */
        for (int i = 0 ; i < num_eager_btls && ignore ; ++i) {
            mca_bml_base_btl_t *eager_btl = mca_bml_base_btl_array_get_index (&bml_endpoint->btl_eager, i);
            if (eager_btl->btl_endpoint == bml_btl->btl_endpoint) {
                ignore = false;
                break;
            }
        }

        if (ignore) {
            continue;
        }

        if (btl->btl_register_mem) {
            /* do not use the RDMA protocol with this btl if 1) leave pinned is disabled,
             * 2) the btl supports put, and 3) the fragment is larger than the minimum
             * pipeline size specified by the BTL */
            if (!opal_leave_pinned && (btl->btl_flags & MCA_BTL_FLAGS_PUT) &&
                  size > btl->btl_min_rdma_pipeline_size) {
                continue;
            }

            /* try to register the memory region with the btl */
            reg_handle = btl->btl_register_mem (btl, bml_btl->btl_endpoint, base,
                                                size, MCA_BTL_REG_FLAG_REMOTE_READ);
            if (NULL == reg_handle) {
                /* btl requires registration but the registration failed */
                continue;
            }
        } /* else no registration is needed with this btl */

        rdma_btls[num_btls_used].bml_btl = bml_btl;
        rdma_btls[num_btls_used].btl_reg = reg_handle;
        weight_total += bml_btl->btl_weight;
        num_btls_used++;
    }

    /* if we don't use leave_pinned and all BTLs that already have this memory
     * registered amount to less then half of available bandwidth - fall back to
     * pipeline protocol */
    if (0 == num_btls_used || (!opal_leave_pinned && weight_total < 0.5))
        return 0;

    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
                                     weight_total);

    bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
    return num_btls_used;
}
Example #14
0
int mca_bml_r2_del_proc_btl(ompi_proc_t* proc, mca_btl_base_module_t* btl)
{
    mca_bml_base_endpoint_t* ep = (mca_bml_base_endpoint_t*)proc->proc_bml;
    double total_bandwidth = 0;
    size_t b;

    /* remove btl from eager list */
    mca_bml_base_btl_array_remove(&ep->btl_eager, btl);
    
    /* remove btl from send list */ 
    if(mca_bml_base_btl_array_remove(&ep->btl_send, btl)) { 
    
        /* compute total_bandwidth and 
           reset max_send_size to the min of all btl's */
        total_bandwidth = 0;
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if (bml_btl->btl_max_send_size < ep->btl_max_send_size) {
                ep->btl_max_send_size = bml_btl->btl->btl_max_send_size;
            }
        }
        
        /* compute weighting factor for this btl */
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_send); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_send, b);
            if(bml_btl->btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_send);
            }
        }
    }
    
    /* remove btl from RDMA list */
    if(mca_bml_base_btl_array_remove(&ep->btl_rdma, btl)) { 
        
        /* computer total bandwidth */
        total_bandwidth = 0;
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            /* update aggregate endpoint info */
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if (ep->btl_rdma_offset < bml_btl->btl_min_rdma_size) {
                ep->btl_rdma_offset = bml_btl->btl_min_rdma_size;
            } 
        }
        
        /* compute weighting factor for this btl */
        for(b=0; b< mca_bml_base_btl_array_get_size(&ep->btl_rdma); b++) {
            mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&ep->btl_rdma, b);
            if(bml_btl->btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = bml_btl->btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / mca_bml_base_btl_array_get_size(&ep->btl_rdma);
            }
        }
    }
    
    return OMPI_SUCCESS;
}
Example #15
0
int mca_bml_r2_add_procs(
                         size_t nprocs, 
                         struct ompi_proc_t** procs, 
                         struct mca_bml_base_endpoint_t** bml_endpoints, 
                         struct ompi_bitmap_t* reachable
                         )
{
    size_t p;
    int rc;
    size_t p_index;
    struct mca_btl_base_endpoint_t ** btl_endpoints = NULL;  
    struct ompi_proc_t** new_procs = NULL; 
    size_t n_new_procs = 0;
    int ret = OMPI_SUCCESS;
    struct ompi_proc_t *unreach_proc = NULL;

    if(0 == nprocs) {
        return OMPI_SUCCESS;
    }
    
    if(OMPI_SUCCESS != (rc = mca_bml_r2_add_btls()) ) {
        return rc;
    }
    
    new_procs = (struct ompi_proc_t **) 
        malloc(nprocs * sizeof(struct ompi_proc_t *)); 
    if (NULL == new_procs ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    memset(bml_endpoints, 0, nprocs * sizeof(struct mca_bml_base_endpoint_t*));

    for(p_index = 0; p_index < nprocs; p_index++) { 
        struct ompi_proc_t* proc;
        proc = procs[p_index]; 
        OBJ_RETAIN(proc); 
        
        if(NULL !=  proc->proc_bml) { 
            bml_endpoints[p_index] = 
                (mca_bml_base_endpoint_t*) proc->proc_bml; 
        } else { 
            new_procs[n_new_procs++] = proc; 
        }
    }

    if ( 0 == n_new_procs ) {
	return OMPI_SUCCESS;
    }
    
    procs = new_procs; 
    nprocs = n_new_procs; 
    
    /* attempt to add all procs to each r2 */
    btl_endpoints = (struct mca_btl_base_endpoint_t **) 
        malloc(nprocs * sizeof(struct mca_btl_base_endpoint_t*)); 
    if (NULL == btl_endpoints) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    
    for(p_index = 0; p_index < mca_bml_r2.num_btl_modules; p_index++) {
        mca_btl_base_module_t* btl = mca_bml_r2.btl_modules[p_index];
        int btl_inuse = 0;
        
        /* if the r2 can reach the destination proc it sets the
         * corresponding bit (proc index) in the reachable bitmap
         * and can return addressing information for each proc
         * that is passed back to the r2 on data transfer calls
         */
        ompi_bitmap_clear_all_bits(reachable);
        memset(btl_endpoints, 0, nprocs *sizeof(struct mca_btl_base_endpoint_t*)); 

        rc = btl->btl_add_procs(btl, n_new_procs, new_procs, btl_endpoints, reachable);
        if(OMPI_SUCCESS != rc) {
            free(btl_endpoints);
            return rc;
        }

        /* for each proc that is reachable - add the endpoint to the bml_endpoints array(s) */
        for(p=0; p<n_new_procs; p++) {
            if(ompi_bitmap_is_set_bit(reachable, p)) {
                ompi_proc_t *proc = new_procs[p]; 
                mca_bml_base_endpoint_t * bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml; 
                mca_bml_base_btl_t* bml_btl; 
                size_t size;
                
                btl_inuse++;

                if(NULL == bml_endpoint) { 
                    
                    
                    /* allocate bml specific proc data */
                    bml_endpoint = OBJ_NEW(mca_bml_base_endpoint_t);
                    if (NULL == bml_endpoint) {
                        opal_output(0, "mca_bml_r2_add_procs: unable to allocate resources");
                        free(btl_endpoints);
                        return OMPI_ERR_OUT_OF_RESOURCE;
                    }
                    
                    /* preallocate space in array for max number of r2s */
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_eager, mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_send,  mca_bml_r2.num_btl_modules);
                    mca_bml_base_btl_array_reserve(&bml_endpoint->btl_rdma,  mca_bml_r2.num_btl_modules);
                    bml_endpoint->btl_max_send_size = -1;
                    bml_endpoint->btl_proc = proc;
                    proc->proc_bml = bml_endpoint; 
                 
                    bml_endpoint->btl_flags_and = 0;
                    bml_endpoint->btl_flags_or = 0;
                }

                bml_endpoints[p] =(mca_bml_base_endpoint_t*)  proc->proc_bml; 
                
                
                /* dont allow an additional BTL with a lower exclusivity ranking */
                size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
                if(size > 0) {
                    bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, size-1);
                    /* skip this btl if the exclusivity is less than the previous */
                    if(bml_btl->btl->btl_exclusivity > btl->btl_exclusivity) {
                        if(btl_endpoints[p] != NULL) {
                            btl->btl_del_procs(btl, 1, &proc, &btl_endpoints[p]);
                        }
                        btl_inuse--;
                        continue;
                    }
                }

                /* cache the endpoint on the proc */
                bml_btl = mca_bml_base_btl_array_insert(&bml_endpoint->btl_send);
                bml_btl->btl = btl;
                bml_btl->btl_eager_limit = btl->btl_eager_limit;
                bml_btl->btl_min_send_size = btl->btl_min_send_size;
                bml_btl->btl_max_send_size = btl->btl_max_send_size;
                bml_btl->btl_min_rdma_size = btl->btl_min_rdma_size;
                bml_btl->btl_max_rdma_size = btl->btl_max_rdma_size;
                bml_btl->btl_cache = NULL;
                bml_btl->btl_endpoint = btl_endpoints[p];
                bml_btl->btl_weight = 0;
                bml_btl->btl_alloc = btl->btl_alloc;
                bml_btl->btl_free = btl->btl_free;
                bml_btl->btl_prepare_src = btl->btl_prepare_src;
                bml_btl->btl_prepare_dst = btl->btl_prepare_dst;
                bml_btl->btl_send = btl->btl_send;
                bml_btl->btl_flags = btl->btl_flags; 
                bml_btl->btl_put = btl->btl_put;
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_PUT) && (NULL == bml_btl->btl_put) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The PUT flag is specified for"
                                " the %s BTL without any PUT function attached. Disard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_PUT;
                }
                bml_btl->btl_get = btl->btl_get;
                if( (bml_btl->btl_flags & MCA_BTL_FLAGS_GET) && (NULL == bml_btl->btl_get) ) {
                    opal_output(0, "mca_bml_r2_add_procs: The GET flag is specified for"
                                " the %s BTL without any GET function attached. Disard the flag !",
                                bml_btl->btl->btl_component->btl_version.mca_component_name);
                    bml_btl->btl_flags ^= MCA_BTL_FLAGS_GET;
                }
                bml_btl->btl_mpool = btl->btl_mpool;
                if( (bml_btl->btl_flags & (MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_SEND)) == 0 ) {
                    /**
                     * If no protocol specified, we have 2 choices: we ignore the BTL
                     * as we don't know which protocl to use, or we suppose that all
                     * BTLs support the send protocol. 
                     */
                    bml_btl->btl_flags |= MCA_BTL_FLAGS_SEND;
                }
                /**
                 * calculate the bitwise OR and AND of the btl flags 
                 */
                bml_endpoint->btl_flags_or |= bml_btl->btl_flags;
                bml_endpoint->btl_flags_and &= bml_btl->btl_flags;
            }
        }
        if(btl_inuse > 0 && NULL != btl->btl_component->btl_progress) {
            size_t p;
            bool found = false;
            for(p=0; p<mca_bml_r2.num_btl_progress; p++) {
                if(mca_bml_r2.btl_progress[p] == btl->btl_component->btl_progress) {
                    found = true;
                    break;
                }
            }
            if(found == false) {
                mca_bml_r2.btl_progress[mca_bml_r2.num_btl_progress] = 
                    btl->btl_component->btl_progress;
                mca_bml_r2.num_btl_progress++;
            }
        }
    }
    free(btl_endpoints);

    /* iterate back through procs and compute metrics for registered r2s */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];
        mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
        double total_bandwidth = 0;
        uint32_t latency = 0xffffffff;
        size_t n_index;
        size_t n_size;

        /* skip over procs w/ no btl's registered */
        if(NULL == bml_endpoint) {
            continue;
        }

        /* (1) determine the total bandwidth available across all btls
         *     note that we need to do this here, as we may already have btls configured
         * (2) determine the highest priority ranking for latency
         * (3) compute the maximum amount of bytes that can be send without any
         *     weighting. Once the left over is smaller than this number we will
         *     start using the weight to compute the correct amount.
         */
        n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); 
        bml_endpoint->bml_max_send_length = 0;
        bml_endpoint->bml_max_rdma_length = 0;
        bml_endpoint->btl_rdma_index = 0;
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t* btl = bml_btl->btl;
            total_bandwidth += bml_btl->btl->btl_bandwidth;
            if(btl->btl_latency < latency) {
                latency = btl->btl_latency;
            }
            bml_endpoint->bml_max_send_length += bml_btl->btl->btl_bandwidth;
        }
        
        /* (1) set the weight of each btl as a percentage of overall bandwidth
         * (2) copy all btl instances at the highest priority ranking into the
         *     list of btls used for first fragments
         */
        for(n_index = 0; n_index < n_size; n_index++) {
            mca_bml_base_btl_t* bml_btl = 
                mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n_index);
            mca_btl_base_module_t *btl = bml_btl->btl;

            /* compute weighting factor for this r2 */
            if(btl->btl_bandwidth > 0) {
                bml_btl->btl_weight = btl->btl_bandwidth / total_bandwidth;
            } else {
                bml_btl->btl_weight = 1.0 / n_size;
            }

            /* check to see if this r2 is already in the array of r2s 
             * used for first fragments - if not add it.
             */
            if(btl->btl_latency == latency) {
                mca_bml_base_btl_t* bml_btl_new = 
                    mca_bml_base_btl_array_insert(&bml_endpoint->btl_eager);
                *bml_btl_new = *bml_btl;
            }

            /* set endpoint max send size as min of available btls */
            if(bml_endpoint->btl_max_send_size > btl->btl_max_send_size)
               bml_endpoint->btl_max_send_size = btl->btl_max_send_size;

            /* check flags - is rdma prefered */
            if(btl->btl_flags & (MCA_BTL_FLAGS_PUT|MCA_BTL_FLAGS_GET) &&
               proc->proc_arch == ompi_proc_local_proc->proc_arch) {
                mca_bml_base_btl_t* bml_btl_rdma = mca_bml_base_btl_array_insert(&bml_endpoint->btl_rdma);
                *bml_btl_rdma = *bml_btl;
                if(bml_endpoint->btl_rdma_offset < bml_btl_rdma->btl_min_rdma_size) {
                    bml_endpoint->btl_rdma_offset = bml_btl_rdma->btl_min_rdma_size;
                }
            }
        }
    }

    /* see if we have a connection to everyone else */
    for(p=0; p<n_new_procs; p++) {
        ompi_proc_t *proc = new_procs[p];

        if (NULL == proc->proc_bml) {
            if (NULL == unreach_proc) {
                unreach_proc = proc;
            }
            ret = OMPI_ERR_UNREACH;
        }
    }

    if (mca_bml_r2.show_unreach_errors && 
        OMPI_ERR_UNREACH == ret) {
        char *local, *remote;

        orte_ns.get_proc_name_string(&local,
                                     &(ompi_proc_local_proc->proc_name));
        orte_ns.get_proc_name_string(&remote,
                                     &(unreach_proc->proc_name));

        opal_show_help("help-mca-bml-r2",
                       "unreachable proc",
                       true, local, remote, NULL);

        free(local);
        free(remote);
    }

    free(new_procs); 

    return ret;
}