Beispiel #1
0
/*
 * Recursively descend from "rank" looking for "me" and, once found,
 * record me's children.
 *
 * The candidate children of a rank are the values (rank | mask) for every
 * single-bit mask above opal_hibit(rank, dim) and up to bit dim-1, where
 * dim = opal_cube_dim(num_procs); candidates >= num_procs are ignored.
 *
 * When rank == me:
 *   - if "children" is non-NULL, each child is appended to that list as a
 *     new orte_routed_tree_t, *num_children is incremented, and the child's
 *     own "relatives" bitmap is initialized and then filled in by a nested
 *     search;
 *   - if "children" is NULL, we are filling in somebody else's relatives:
 *     the child's bit is set in "relatives" and the nested search keeps
 *     writing into that same bitmap.
 *
 * Returns "parent" when rank == me, the first non-negative result of a
 * recursive call otherwise, or -1 if "me" was not found below "rank".
 */
int down_search(int rank, int parent, int me, int num_procs,
                int *num_children, opal_list_t *children, opal_bitmap_t *relatives)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                if (NULL != children) {
                    /* this is a direct child - add it to my list.
                     * The tree item is only created here: allocating it
                     * when we merely record relatives would leak it, since
                     * nothing takes ownership of it in that case */
                    child = OBJ_NEW(orte_routed_tree_t);
                    child->vpid = peer;
                    opal_list_append(children, &child->super);
                    (*num_children)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);
                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to this relations */
                    relations = relatives;
                }
                /* search for this child's relatives, restarting from rank 0
                 * with no child list so that only bits get recorded */
                down_search(0, 0, peer, num_procs, NULL, NULL, relations);
            }
        }
        return parent;
    }

    /* not me: look through the children of this rank */
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < num_procs) {
            /* execute compute on this child; stop at the first hit */
            if (0 <= (found = down_search(peer, rank, me, num_procs, num_children, children, relatives))) {
                return found;
            }
        }
    }
    return -1;
}
Beispiel #2
0
/*
 * Register "nprocs" processes with the BML and build the BTL index/map
 * for each of them.
 *
 * procs   array of process pointers (handed to the BML as ompi_proc_t**)
 * nprocs  number of entries in procs; 0 is a successful no-op
 *
 * Returns OSHMEM_SUCCESS, or the first error code reported by
 * opal_bitmap_init, bml_add_procs, bml_register_error, create_btl_list
 * or create_btl_idx.  The temporary reachability bitmap is destructed on
 * every path that constructed it.
 */
int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;

    if (0 == nprocs) {
        return OSHMEM_SUCCESS;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int) nprocs);
    if (OSHMEM_SUCCESS != rc) {
        /* the object has been constructed, so it must be destructed
         * even though sizing it failed */
        goto cleanup_and_return;
    }

    rc = mca_bml.bml_add_procs(nprocs, (ompi_proc_t**) procs, &reachable);

    if (OSHMEM_SUCCESS != rc) {
        SPML_ERROR("SPML YODA: shmem error\n");
        goto cleanup_and_return;
    }

    rc = mca_bml.bml_register_error(mca_spml_yoda_error_handler);
    if (OMPI_SUCCESS != rc) {
        goto cleanup_and_return;
    }

    /* create btl index and map */
    rc = create_btl_list();
    if (OSHMEM_SUCCESS != rc) {
        goto cleanup_and_return;
    }

    for (i = 0; i < nprocs; i++) {
        rc = create_btl_idx(i);
        if (OSHMEM_SUCCESS != rc) {
            goto cleanup_and_return;
        }
    }

cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
Beispiel #3
0
/*
 * Initialize the global attribute bookkeeping: the keyval hash table,
 * the key bitmap, the attribute lock and the predefined attributes.
 * Per the original note, this is called one time, during MPI_INIT().
 *
 * It also records, in int_pos and integer_pos, which slot of a pointer
 * (viewed as an array of int) holds the value when the pointer is set
 * to 1.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if an object cannot be
 * created or the bitmap cannot be sized, or the error code returned by
 * opal_hash_table_init / ompi_attr_create_predefined.
 *
 * NOTE(review): objects created before a failing step are not released
 * on the error paths; confirm whether the caller tears them down.
 */
int ompi_attr_init(void)
{
    int ret;
    void *bogus = (void*) 1;
    int *p = (int *) &bogus;

    keyval_hash = OBJ_NEW(opal_hash_table_t);
    if (NULL == keyval_hash) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    key_bitmap = OBJ_NEW(opal_bitmap_t);
    if (NULL == key_bitmap) {
        /* same treatment as the hash table above: never hand a NULL
         * bitmap to the opal_bitmap_* calls below */
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /*
     * Set the max size to OMPI_FORTRAN_HANDLE_MAX to enforce bound
     */
    opal_bitmap_set_max_size (key_bitmap, OMPI_FORTRAN_HANDLE_MAX);
    if (0 != opal_bitmap_init(key_bitmap, 32)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* find the int-sized slot of "bogus" that contains the 1 */
    for (int_pos = 0; int_pos < (sizeof(void*) / sizeof(int));
         ++int_pos) {
        if (p[int_pos] == 1) {
            break;
        }
    }

    /* same scan, bounded by the number of MPI_Fint that fit in a pointer.
     * NOTE(review): the index is still applied to the int view "p", so
     * this assumes MPI_Fint and int have the same size - confirm */
    for (integer_pos = 0; integer_pos < (sizeof(void*) / sizeof(MPI_Fint));
         ++integer_pos) {
        if (p[integer_pos] == 1) {
            break;
        }
    }

    OBJ_CONSTRUCT(&attribute_lock, opal_mutex_t);

    if (OMPI_SUCCESS != (ret = opal_hash_table_init(keyval_hash,
                                                    ATTR_TABLE_SIZE))) {
        return ret;
    }
    if (OMPI_SUCCESS != (ret = ompi_attr_create_predefined())) {
        return ret;
    }

    return OMPI_SUCCESS;
}
Beispiel #4
0
/*
 * Register the error handler with the BML, add "nprocs" processes through
 * the selected PML, and build the BTL index/map for each of them.
 *
 * procs   array of process pointers passed on to the PML add_procs call
 * nprocs  number of entries in procs; 0 is a successful no-op
 *
 * Returns OSHMEM_SUCCESS, or the first error code reported by
 * opal_bitmap_init, bml_register_error, the PML add_procs call,
 * create_btl_list or create_btl_idx.  The temporary bitmap is destructed
 * on every path that constructed it.
 */
int mca_spml_yoda_add_procs(oshmem_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;

    if (0 == nprocs) {
        return OSHMEM_SUCCESS;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int) nprocs);
    if (OSHMEM_SUCCESS != rc) {
        /* the object has been constructed, so it must be destructed
         * even though sizing it failed */
        goto cleanup_and_return;
    }

    rc = mca_bml.bml_register_error(mca_spml_yoda_error_handler);
    if (OMPI_SUCCESS != rc) {
        goto cleanup_and_return;
    }

    /* create_btl_idx requires the proc was add_proc'ed, so do it now */
    rc = MCA_PML_CALL(add_procs(procs, nprocs));
    if (OMPI_SUCCESS != rc) {
        goto cleanup_and_return;
    }

    /* create btl index and map */
    rc = create_btl_list();
    if (OSHMEM_SUCCESS != rc) {
        goto cleanup_and_return;
    }

    for (i = 0; i < nprocs; i++) {
        rc = create_btl_idx(i);
        if (OSHMEM_SUCCESS != rc) {
            goto cleanup_and_return;
        }
    }

cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
/*
 * Compute the children of "rank" in a tree whose fan-out is
 * mca_routed_radix_component.radix, recursing to collect all descendants.
 *
 * rank          rank whose children are computed
 * num_children  incremented once per direct child (only dereferenced
 *               when children is non-NULL)
 * children      if non-NULL, each direct child is appended here as a new
 *               orte_routed_tree_t whose "relatives" bitmap receives the
 *               child's descendants; if NULL, we are filling in somebody
 *               else's relatives
 * relatives     bitmap that receives the bits when children is NULL
 */
static void radix_tree(int rank, int *num_children,
                       opal_list_t *children, opal_bitmap_t *relatives)
{
    int i, peer, Sum, NInLevel;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* compute how many procs are at my level */
    Sum=1;
    NInLevel=1;

    while ( Sum < (rank+1) ) {
        NInLevel *= mca_routed_radix_component.radix;
        Sum += NInLevel;
    }

    /* our children start at our rank + num_in_level */
    peer = rank + NInLevel;
    for (i = 0; i < mca_routed_radix_component.radix; i++) {
        if (peer < (int)orte_process_info.num_procs) {
            if (NULL != children) {
                /* this is a direct child - add it to my list.
                 * The tree item is only created here: allocating it when
                 * we merely record relatives would leak it, since nothing
                 * takes ownership of it in that case */
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                opal_list_append(children, &child->super);
                (*num_children)++;
                /* setup the relatives bitmap */
                opal_bitmap_init(&child->relatives, orte_process_info.num_procs);
                /* point to the relatives */
                relations = &child->relatives;
            } else {
                /* we are recording someone's relatives - set the bit */
                if (OPAL_SUCCESS != opal_bitmap_set_bit(relatives, peer)) {
                    opal_output(0, "%s Error: could not set relations bit!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                }
                /* point to this relations */
                relations = relatives;
            }
            /* search for this child's relatives */
            radix_tree(peer, NULL, NULL, relations);
        }
        /* consecutive children are NInLevel apart */
        peer += NInLevel;
    }
}
Beispiel #6
0
/*
 * Start an allgather on "coll" using our own contribution "sendbuf".
 *
 * Our position in coll->dmns is stored in coll->my_rank.  If we are not
 * listed, the collective is finalized with ORTE_ERR_NOT_FOUND and that
 * code is returned.  Otherwise our payload is copied into coll->bucket,
 * processing is kicked off via brks_allgather_process_data(coll, 0) and
 * ORTE_SUCCESS is returned.
 */
static int allgather(orte_grpcomm_coll_t *coll,
                     opal_buffer_t *sendbuf)
{
    orte_vpid_t idx;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:coll:bruck algo employed for %d processes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)coll->ndmns));

    /* locate ourselves among the participants */
    coll->my_rank = ORTE_VPID_INVALID;
    idx = 0;
    while (idx < coll->ndmns) {
        if (ORTE_PROC_MY_NAME->vpid == coll->dmns[idx]) {
            coll->my_rank = idx;
            break;
        }
        ++idx;
    }

    if (ORTE_VPID_INVALID != coll->my_rank) {
        /* we count as the first contributor */
        coll->nreported = 1;

        /* the receive mask is only needed when there is someone else */
        if (1 < coll->ndmns) {
            opal_bitmap_init (&coll->distance_mask_recv, (uint32_t) log2 (coll->ndmns) + 1);
        }

        /* seed the collection with our own data, then process it */
        opal_dss.copy_payload(&coll->bucket, sendbuf);
        brks_allgather_process_data (coll, 0);

        return ORTE_SUCCESS;
    }

    /* bozo case: we are not part of this collective */
    OPAL_OUTPUT((orte_grpcomm_base_framework.framework_output,
                 "Peer not found"));
    ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
    brks_finalize_coll(coll, ORTE_ERR_NOT_FOUND);
    return ORTE_ERR_NOT_FOUND;
}
/*
 * Report this process's parent in the linear routing chain and, when
 * "children" is non-NULL, append its single child to that list.
 *
 * The chain runs from the HNP up to vpid num_procs-1, so the child is
 * my_vpid+1 and the parent is my_vpid-1.  The child's "relatives" bitmap
 * gets a bit for every vpid above the child.
 *
 * Returns ORTE_VPID_INVALID for processes that are neither daemon nor HNP
 * and for the HNP itself; otherwise my_vpid-1.
 */
static orte_vpid_t get_routing_tree(opal_list_t *children)
{
    /* only daemons and the HNP are allowed to route */
    if (!(ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP)) {
        return ORTE_VPID_INVALID;
    }

    /* the last daemon in the chain has no child */
    if (NULL != children &&
        ORTE_PROC_MY_NAME->vpid < orte_process_info.num_procs-1) {
        orte_routed_tree_t *next = OBJ_NEW(orte_routed_tree_t);
        orte_vpid_t above;

        opal_bitmap_init(&next->relatives, orte_process_info.num_procs);
        next->vpid = ORTE_PROC_MY_NAME->vpid + 1;

        /* everyone beyond the child is reached through it */
        above = next->vpid + 1;
        while (above < orte_process_info.num_procs) {
            opal_bitmap_set_bit(&next->relatives, above);
            above++;
        }

        opal_list_append(children, &next->super);
    }

    /* the HNP has no parent; everyone else hangs off my_vpid-1 */
    return ORTE_PROC_IS_HNP ? ORTE_VPID_INVALID
                            : (ORTE_PROC_MY_NAME->vpid - 1);
}
Beispiel #8
0
/* This function checks to how many processes of 'comm' this process
   communicates through the BTL component 'component_name' and returns
   that count in 'ncount'. Furthermore it returns a 'key', which can be
   used to split the communicator into subgroups: the key is the lowest
   rank among this process and the matching peers, or MPI_UNDEFINED when
   no peer uses the component.

   For every peer only the first BTL of the endpoint's send list (or of
   its rdma list when mca_coll_hierarch_use_rdma_param is set) is looked
   at.

   On error (bitmap setup or bml_add_procs failure) the defaults
   *ncount = 0 and *key = MPI_UNDEFINED are left in place.

   'component_level' is accepted but not used by the body.

   NOTE(review): earlier documentation also mentioned counting peers
   reached over a 'faster' protocol; the body below only counts exact
   component-name matches.
*/
static void 
mca_coll_hierarch_checkfor_component ( struct ompi_communicator_t *comm,
                                       int component_level,
                                       char *component_name, 
                                       int *key,
                                       int *ncount )
{
    opal_bitmap_t reachable;
    ompi_proc_t **procs=NULL;
    struct mca_bml_base_btl_array_t *bml_btl_array=NULL;
    mca_bml_base_btl_t *bml_btl=NULL;
    mca_btl_base_component_t *btl=NULL;
    mca_bml_base_endpoint_t *endpoint;

    int i, size, rc;

    int counter=0;
    int firstproc=999999;
    int rank = -1;
    int use_rdma=0;

    /* default values in case an error occurs */
    *ncount=0;
    *key=MPI_UNDEFINED;

    /* Shall we check the rdma list instead of the send list in the endpoint structure? */
    use_rdma = mca_coll_hierarch_use_rdma_param;

    size = ompi_comm_size ( comm );
    rank = ompi_comm_rank ( comm );

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, size);
    if(OMPI_SUCCESS != rc) {
        goto cleanup;
    }

    procs = comm->c_local_group->grp_proc_pointers;
    rc = mca_bml.bml_add_procs ( size, procs, &reachable );
    if(OMPI_SUCCESS != rc) {
        goto cleanup;
    }

    for ( i=0; i<size; i++ ) {
        if ( rank ==  i ) {
            /* skip myself */
            continue;
        }

        endpoint = (mca_bml_base_endpoint_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
        if ( use_rdma ) {
            bml_btl_array = &(endpoint->btl_rdma);
        }
        else {
            bml_btl_array = &(endpoint->btl_send);
        }
        /* only the first BTL in the list is considered */
        bml_btl = mca_bml_base_btl_array_get_index ( bml_btl_array, 0 );
        btl = bml_btl->btl->btl_component;

        /* sanity check */
        if ( strcmp(btl->btl_version.mca_type_name,"btl") ) {
            printf("Oops, got the wrong component! type_name = %s\n",
                   btl->btl_version.mca_type_name );
        }

        /* check for the required component */
        if (! strcmp (btl->btl_version.mca_component_name, component_name)){
            counter++;
            /* remember the lowest matching peer rank */
            if (i<firstproc ) {
                firstproc = i;
            }
        }
    }

    *ncount = counter; 
    /* final decision */
    if ( counter == 0 ) {
        /* this is the section indicating, that we are not 
           using this component */
        firstproc = MPI_UNDEFINED;
    }
    else {
        /* the key is the lowest rank of the subgroup, which may be us */
        if ( rank < firstproc ) {
            firstproc = rank;
        }
    }

    *key = firstproc;

cleanup:
    /* the bitmap was constructed above, so release it on every path */
    OBJ_DESTRUCT(&reachable);

    return;
}
/*
 * Test driver for opal_bitmap_t.
 *
 * Output goes to the stream "error_out": stderr when STANDALONE is
 * defined, otherwise ./opal_bitmap_test_out.txt (falling back to stderr
 * if that file cannot be opened).
 *
 * The driver first calls opal_bitmap_init with a NULL bitmap and with a
 * size of -1, logging a message when OPAL_ERR_BAD_PARAM comes back, then
 * initializes a bitmap of size BSIZE and runs the set / clear / is_set /
 * clear_all / set_all / find_and_set test routines on it.
 *
 * Returns 0; exits with -1 if the BSIZE initialization fails.
 */
int main(int argc, char *argv[])
{
    /* Local variables */
    opal_bitmap_t bm;
    int err;

    /* Perform overall test initialization */
    test_init("opal_bitmap_t");

#ifdef STANDALONE
    error_out = stderr;
#else
    error_out = fopen( "./opal_bitmap_test_out.txt", "w" );
    if( error_out == NULL ) {
        error_out = stderr;
    }
#endif

    /* Initialize bitmap  */

    /* NOTE(review): PRINT_VALID_ERR presumably announces that the next
     * error is an expected one - confirm against its definition */
    PRINT_VALID_ERR;
    err = opal_bitmap_init(NULL, 2);
    if (err == OPAL_ERR_BAD_PARAM) {
        fprintf(error_out, "ERROR: Initialization of bitmap failed\n\n");
    }

    PRINT_VALID_ERR;
    err = opal_bitmap_init(&bm, -1);
    if (err == OPAL_ERR_BAD_PARAM) {
        fprintf(error_out, "ERROR: Initialization of bitmap failed \n\n");
    }

    err = opal_bitmap_init(&bm, BSIZE);
    if (0 > err) {
        fprintf(error_out, "Error in bitmap create -- aborting \n");
        exit(-1);
    }

    fprintf(error_out, "\nTesting bitmap set... \n");
    test_bitmap_set(&bm);

    fprintf(error_out, "\nTesting bitmap clear ... \n");
    test_bitmap_clear(&bm);

    fprintf(error_out, "\nTesting bitmap is_set ... \n");
    test_bitmap_is_set(&bm);

    fprintf(error_out, "\nTesting bitmap clear_all... \n");
    test_bitmap_clear_all(&bm);

    fprintf(error_out, "\nTesting bitmap set_all... \n");
    test_bitmap_set_all(&bm);

    fprintf(error_out, "\nTesting bitmap find_and_set... \n");
    test_bitmap_find_and_set(&bm);

    fprintf(error_out, "\n~~~~~~     Testing complete     ~~~~~~ \n\n");

    test_finalize();
#ifndef STANDALONE
    /* error_out aliases stderr when the log file could not be opened;
     * only close the stream if it really is our own file */
    if (error_out != stderr) {
        fclose(error_out);
    }
#endif

    return 0;
}
Beispiel #10
0
/*
 * Add "nprocs" processes to the ob1 PML.
 *
 * The processes are handed to the BML, the limits advertised by every
 * initialized BTL are validated against the size of the ob1 header, and
 * the ob1 receive callbacks and error handler are registered with the
 * BML.
 *
 * procs   array of processes to add
 * nprocs  number of entries in procs; 0 is a successful no-op
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_BAD_PARAM when a BTL limit is too small
 * (a help message is shown), or the first error code returned by one of
 * the calls made along the way.  The temporary reachability bitmap is
 * destructed on every path that constructed it.
 */
int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc) {
        /* constructed objects must be destructed even if sizing failed */
        goto cleanup_and_return;
    }

    /*
     * JJH: Disable this in FT enabled builds since
     * we use a wrapper PML. It will cause this check to 
     * return failure as all processes will return the wrapper PML
     * component in use instead of the wrapped PML component underneath.
     */
#if OPAL_ENABLE_FT_CR == 0
    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("ob1",
                                                              procs,
                                                              nprocs))) {
        /* do not return directly: "reachable" still has to be released */
        goto cleanup_and_return;
    }
#endif

    rc = mca_bml.bml_add_procs( nprocs,
                                procs,
                                &reachable );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */

    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm = 
            (mca_btl_base_selected_module_t*) item;
        /* the eager limit must at least hold one ob1 header */
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
            opal_show_help("help-mpi-pml-ob1.txt", "eager_limit_too_small",
                           true, 
                           sm->btl_component->btl_version.mca_component_name,
                           ompi_process_info.nodename,
                           sm->btl_component->btl_version.mca_component_name,
                           sm->btl_module->btl_eager_limit,
                           sm->btl_component->btl_version.mca_component_name,
                           sizeof(mca_pml_ob1_hdr_t),
                           sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            goto cleanup_and_return;
        }
#if OPAL_CUDA_SUPPORT_60
        /* If size is SIZE_MAX, then we know we want to set this to the minimum possible
         * value which is the size of the PML header. */
        if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
            sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
        }
        /* a CUDA eager limit of 0 is not checked */
        if (0 != sm->btl_module->btl_cuda_eager_limit) {
            if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
                opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
                               true, 
                               sm->btl_component->btl_version.mca_component_name,
                               ompi_process_info.nodename,
                               sm->btl_component->btl_version.mca_component_name,
                               sm->btl_module->btl_cuda_eager_limit,
                               sm->btl_component->btl_version.mca_component_name,
                               sizeof(mca_pml_ob1_hdr_t),
                               sm->btl_component->btl_version.mca_component_name);
                rc = OMPI_ERR_BAD_PARAM;
                goto cleanup_and_return;
            }
        }
        if (0 == sm->btl_module->btl_cuda_rdma_limit) {
            /* All is fine.  0 means to ignore value so set to SIZE_MAX */
            sm->btl_module->btl_cuda_rdma_limit = SIZE_MAX;
        } else {
            /* the rdma limit may not be below the eager limit */
            if (sm->btl_module->btl_cuda_rdma_limit < sm->btl_module->btl_cuda_eager_limit) {
                opal_show_help("help-mpi-pml-ob1.txt", "cuda_rdma_limit_too_small",
                               true, 
                               sm->btl_component->btl_version.mca_component_name,
                               ompi_process_info.nodename,
                               sm->btl_component->btl_version.mca_component_name,
                               sm->btl_module->btl_cuda_rdma_limit,
                               sm->btl_component->btl_version.mca_component_name,
                               sm->btl_module->btl_cuda_eager_limit,
                               sm->btl_component->btl_version.mca_component_name);
                rc = OMPI_ERR_BAD_PARAM;
                goto cleanup_and_return;
            }
        }
#endif /* OPAL_CUDA_SUPPORT_60 */
    }


    /* TODO: Move these callback registration to another place */
    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_MATCH,
                               mca_pml_ob1_recv_frag_callback_match,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_RNDV,
                               mca_pml_ob1_recv_frag_callback_rndv,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_RGET,
                               mca_pml_ob1_recv_frag_callback_rget,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_ACK,
                               mca_pml_ob1_recv_frag_callback_ack,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_FRAG,
                               mca_pml_ob1_recv_frag_callback_frag,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_PUT,
                               mca_pml_ob1_recv_frag_callback_put,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_OB1_HDR_TYPE_FIN,
                               mca_pml_ob1_recv_frag_callback_fin,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_ob1_error_handler);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

  cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
Beispiel #11
0
/*
 * Add "nprocs" processes to the bfo PML.
 *
 * Each proc's proc_pml pointer is cleared, the processes are handed to
 * the BML, the eager limit of every initialized BTL is validated against
 * the size of the bfo header, and the bfo receive callbacks and error
 * handler are registered with the BML.
 *
 * procs   array of processes to add
 * nprocs  number of entries in procs; 0 is a successful no-op
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_BAD_PARAM when a BTL eager limit is too
 * small (a help message is shown), or the first error code returned by
 * one of the calls made along the way.  The temporary reachability
 * bitmap is destructed on every path that constructed it.
 */
int mca_pml_bfo_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

    /* we don't have any endpoint data we need to cache on the
       ompi_proc_t, so set proc_pml to NULL */
    for (i = 0 ; i < nprocs ; ++i) {
        procs[i]->proc_pml = NULL;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc) {
        /* constructed objects must be destructed even if sizing failed */
        goto cleanup_and_return;
    }

    /*
     * JJH: Disable this in FT enabled builds since
     * we use a wrapper PML. It will cause this check to 
     * return failure as all processes will return the wrapper PML
     * component in use instead of the wrapped PML component underneath.
     */
#if OPAL_ENABLE_FT_CR == 0
    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("bfo",
                                                              procs,
                                                              nprocs))) {
        /* do not return directly: "reachable" still has to be released */
        goto cleanup_and_return;
    }
#endif

    rc = mca_bml.bml_add_procs( nprocs,
                                procs,
                                &reachable );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */

    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm = 
            (mca_btl_base_selected_module_t*) item;
        /* the eager limit must at least hold one bfo header */
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_bfo_hdr_t)) {
            orte_show_help("help-mpi-pml-bfo.txt", "eager_limit_too_small",
                           true, 
                           sm->btl_component->btl_version.mca_component_name,
                           orte_process_info.nodename,
                           sm->btl_component->btl_version.mca_component_name,
                           sm->btl_module->btl_eager_limit,
                           sm->btl_component->btl_version.mca_component_name,
                           sizeof(mca_pml_bfo_hdr_t),
                           sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            goto cleanup_and_return;
        }
    }


    /* TODO: Move these callback registration to another place */
    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_MATCH,
                               mca_pml_bfo_recv_frag_callback_match,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RNDV,
                               mca_pml_bfo_recv_frag_callback_rndv,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_RGET,
                               mca_pml_bfo_recv_frag_callback_rget,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_ACK,
                               mca_pml_bfo_recv_frag_callback_ack,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FRAG,
                               mca_pml_bfo_recv_frag_callback_frag,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_PUT,
                               mca_pml_bfo_recv_frag_callback_put,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    rc = mca_bml.bml_register( MCA_PML_BFO_HDR_TYPE_FIN,
                               mca_pml_bfo_recv_frag_callback_fin,
                               NULL );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

#if PML_BFO
    rc = mca_pml_bfo_register_callbacks();
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;
#endif /* PML_BFO */
    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_bfo_error_handler);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

  cleanup_and_return:
    OBJ_DESTRUCT(&reachable);

    return rc;
}
Beispiel #12
0
/*
 * Constructor for orte_oob_base_peer_t: sets up the "addressable" bitmap
 * (constructed, then initialized with size 8) and clears the component
 * pointer.
 */
static void pr_cons(orte_oob_base_peer_t *ptr)
{
    /* bitmap first ... */
    OBJ_CONSTRUCT(&ptr->addressable, opal_bitmap_t);
    opal_bitmap_init(&ptr->addressable, 8);

    /* ... no component assigned yet */
    ptr->component = NULL;
}
Beispiel #13
0
/*
 * Recursively descend from "rank" looking for "me" and, once found,
 * record me's children.
 *
 * The candidate children of a rank are the values (rank | mask) for every
 * single-bit mask above opal_hibit(rank, dim) and up to bit dim-1, where
 * dim = opal_cube_dim(num_procs); candidates >= num_procs are ignored.
 *
 * When rank == me:
 *   - if "mine" is true, each child is appended to "childrn" as a new
 *     orte_routed_tree_t, *nchildren is incremented, and the child's own
 *     "relatives" bitmap is initialized and then filled in by a nested
 *     call made with mine == false;
 *   - if "mine" is false, we are filling in somebody else's relatives:
 *     the child's bit is set in "relatives" and the nested call keeps
 *     writing into that same bitmap.
 *
 * Returns "parent" when rank == me, the first non-negative result of a
 * recursive call otherwise, or -1 if "me" was not found below "rank".
 */
static int binomial_tree(int rank, int parent, int me, int num_procs,
                         int *nchildren, opal_list_t *childrn,
                         opal_bitmap_t *relatives, bool mine)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                         "%s routed:binomial rank %d parent %d me %d num_procs %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         rank, parent, me, num_procs));

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);

        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                /* peer is the vpid of the child that was found */
                OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial %d found child %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     rank,
                                     ORTE_VPID_PRINT(peer)));

                if (mine) {
                    /* this is a direct child - add it to my list.
                     * The tree item is only created here: allocating it
                     * when we merely record relatives would leak it, since
                     * nothing takes ownership of it in that case */
                    child = OBJ_NEW(orte_routed_tree_t);
                    child->vpid = peer;
                    opal_list_append(childrn, &child->super);
                    (*nchildren)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);

                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone's relatives - set the bit */
                    opal_bitmap_set_bit(relatives, peer);
                    /* point to this relations */
                    relations = relatives;
                }
                /* search for this child's relatives, restarting from rank 0
                 * with mine == false so that only bits get recorded */
                binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                         "%s routed:binomial find children of rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
    bitmap = opal_cube_dim(num_procs);

    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                             "%s routed:binomial find children checking peer %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
        if (peer < num_procs) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial find children computing tree",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* execute compute on this child; stop at the first hit */
            if (0 <= (found = binomial_tree(peer, rank, me, num_procs, nchildren, childrn, relatives, mine))) {
                OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial find children returning found value %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
                return found;
            }
        }
    }
    return -1;
}
Beispiel #14
0
/*
 * Build the group of all members of group1 that are not flagged as also
 * belonging to group2, preserving group1's order.
 *
 * Group Difference has to use the dense format since we don't support
 * two parent groups in the group structure and maintain functions.
 *
 * group1     group to take members from
 * group2     group whose members are excluded
 * new_group  receives the result; MPI_GROUP_EMPTY (retained) when no
 *            member is left
 *
 * Returns OMPI_SUCCESS / MPI_SUCCESS on success, the error code from
 * opal_bitmap_init or ompi_group_dense_overlap, or MPI_ERR_GROUP when the
 * new group cannot be allocated.  The scratch bitmap is destructed on
 * every path.
 */
int ompi_group_difference(ompi_group_t* group1, ompi_group_t* group2,
                          ompi_group_t **new_group) {

    /* local variables */
    int new_group_size, overlap_count, rc;
    ompi_group_t *new_group_pointer;
    ompi_proc_t *proc1_pointer;
    opal_bitmap_t bitmap;

    /*
     * form difference
     */

    /* get new group size */
    OBJ_CONSTRUCT(&bitmap, opal_bitmap_t);
    rc = opal_bitmap_init (&bitmap, 32);
    if (OPAL_SUCCESS != rc) {
        /* constructed objects must be destructed even if sizing failed */
        OBJ_DESTRUCT(&bitmap);
        return rc;
    }

    /* flag the overlapping members in the bitmap and count them.
     * NOTE(review): the loop below assumes the bits are indexed by
     * group1 rank - confirm against ompi_group_dense_overlap */
    overlap_count = ompi_group_dense_overlap (group2, group1, &bitmap);
    if (0 > overlap_count) {
        OBJ_DESTRUCT(&bitmap);
        return overlap_count;
    }

    new_group_size = group1->grp_proc_count - overlap_count;
    if ( 0 == new_group_size ) {
        *new_group = MPI_GROUP_EMPTY;
        OBJ_RETAIN(MPI_GROUP_EMPTY);
        OBJ_DESTRUCT(&bitmap);
        return MPI_SUCCESS;
    }

    /* allocate a new ompi_group_t structure */
    new_group_pointer = ompi_group_allocate(new_group_size);
    if( NULL == new_group_pointer ) {
        OBJ_DESTRUCT(&bitmap);
        return MPI_ERR_GROUP;
    }

    /* fill in group list */
    /* loop over group1 members, skipping those flagged as overlapping */
    for (int proc1 = 0, cnt = 0 ; proc1 < group1->grp_proc_count ; ++proc1) {
        if (opal_bitmap_is_set_bit (&bitmap, proc1)) {
            continue;
        }

        proc1_pointer = ompi_group_get_proc_ptr_raw (group1, proc1);
        new_group_pointer->grp_proc_pointers[cnt++] = proc1_pointer;
    }  /* end proc loop */

    OBJ_DESTRUCT(&bitmap);

    /* increment proc reference counters */
    ompi_group_increment_proc_count(new_group_pointer);

    /* find my rank: we are only in the result if we are in group1
     * and not in group2 */
    if (MPI_UNDEFINED == group1->grp_my_rank || MPI_UNDEFINED != group2->grp_my_rank) {
        new_group_pointer->grp_my_rank = MPI_UNDEFINED;
    } else {
        ompi_set_group_rank(new_group_pointer, ompi_proc_local_proc);
    }

    *new_group = (MPI_Group)new_group_pointer;

    return OMPI_SUCCESS;
}
Beispiel #15
0
/**
 * Set up local delivery of stdin to a process through a file descriptor.
 *
 * Only ORTE_IOF_STDIN is accepted as source tag.  If the destination job
 * carries the ORTE_JOB_MAPPER attribute, the job is recorded (once) in the
 * component's stdin_jobs array.  The descriptor is then switched to
 * non-blocking mode, a stdin sink is defined on it, and the sink is attached
 * to the process' entry in the component's procs list (the entry is created
 * when it does not exist yet).
 *
 * @param dst_name  name of the process the stdin sink is defined for
 * @param src_tag   must be ORTE_IOF_STDIN
 * @param fd        file descriptor the sink is defined on
 *
 * @return ORTE_ERR_NOT_SUPPORTED for any tag other than ORTE_IOF_STDIN,
 *         ORTE_SUCCESS otherwise.
 */
static int mrhnp_pull(const orte_process_name_t* dst_name,
                    orte_iof_tag_t src_tag,
                    int fd)
{
    orte_iof_sink_t *sink;
    int flags, j;
    orte_iof_proc_t *ptr, *proct;
    opal_list_item_t *item;
    orte_job_t *jdata;
    orte_iof_job_t *jptr;
    bool found;

    /* this is a local call - only stdin is supported */
    if (ORTE_IOF_STDIN != src_tag) {
        return ORTE_ERR_NOT_SUPPORTED;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:mrhnp pulling fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));

    /* get the job object for this proc and check to see if it
     * is a mapper - if so, add it to the jobs that receive
     * our stdin
     */
    jdata = orte_get_job_data_object(dst_name->jobid);
    if (NULL == jdata) {
        /* no job object means there are no attributes to inspect, so the
         * job cannot be registered as a stdin receiver - report it and
         * carry on with the sink setup instead of dereferencing NULL
         */
        opal_output(orte_iof_base_framework.framework_output,
                    "[%s:%d]: no job data object found for process %s\n",
                    __FILE__, __LINE__, ORTE_NAME_PRINT(dst_name));
    } else if (orte_get_attribute(&jdata->attributes, ORTE_JOB_MAPPER, NULL, OPAL_BOOL)) {
        /* see if we already have it */
        found = false;
        for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
            if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
                continue;
            }
            if (jptr->jdata->jobid == jdata->jobid) {
                found = true;
                break;
            }
        }
        if (!found) {
            /* first time we see this job - track it, holding a reference
             * on the job object for as long as the tracker points to it
             */
            jptr = OBJ_NEW(orte_iof_job_t);
            OBJ_RETAIN(jdata);
            jptr->jdata = jdata;
            opal_bitmap_init(&jptr->xoff, jdata->num_procs);
            opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr);
        }
    }

    /* set the file descriptor to non-blocking - do this before we setup
     * the sink in case it fires right away
     */
    if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
        opal_output(orte_iof_base_framework.framework_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                    __FILE__, __LINE__, errno);
    } else {
        flags |= O_NONBLOCK;
        fcntl(fd, F_SETFL, flags);
    }

    ORTE_IOF_SINK_DEFINE(&sink, dst_name, fd, ORTE_IOF_STDIN,
                         stdin_write_handler, NULL);
    /* mark this process as the daemon associated with the sink */
    sink->daemon.jobid = ORTE_PROC_MY_NAME->jobid;
    sink->daemon.vpid = ORTE_PROC_MY_NAME->vpid;

    /* find the proct for this proc */
    proct = NULL;
    for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
         item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
         item = opal_list_get_next(item)) {
        ptr = (orte_iof_proc_t*)item;
        if (ptr->name.jobid == dst_name->jobid &&
            ptr->name.vpid == dst_name->vpid) {
            proct = ptr;
            break;
        }
    }
    if (NULL == proct) {
        /* we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super);
    }
    proct->sink = sink;

    return ORTE_SUCCESS;
}
Beispiel #16
0
/**
 * Set up reading from a file descriptor on behalf of a process.
 *
 * Two cases are handled, selected by src_tag:
 *
 *  - tag without ORTE_IOF_STDIN (stdout, stderr, stddiag of a local
 *    process): the descriptor is made non-blocking, the process is added to
 *    the component's procs list if needed (optionally with a sink to a
 *    per-process output file when orte_output_filename is set), and a read
 *    event is defined for the stream.  The read events are only activated
 *    once all three of them have been defined for the process.
 *
 *  - tag with ORTE_IOF_STDIN: a job carrying the ORTE_JOB_MAPPER attribute
 *    is recorded (once) in the component's stdin_jobs array, and - only the
 *    first time through - the component-wide stdin read event is defined.
 *
 * @param dst_name  name of the process the descriptor belongs to
 * @param src_tag   stream tag selecting the case above
 * @param fd        file descriptor to read from
 *
 * @return ORTE_SUCCESS on success, and also when dst_name has an invalid
 *         vpid or fd is negative (nothing is done in that case);
 *         ORTE_ERR_NOT_FOUND when the job data object cannot be found;
 *         ORTE_ERR_FILE_OPEN_FAILURE when the output file cannot be opened.
 */
static int mrhnp_push(const orte_process_name_t* dst_name, orte_iof_tag_t src_tag, int fd)
{
    orte_job_t *jdata;
    orte_iof_sink_t *sink;
    orte_iof_proc_t *proct;
    opal_list_item_t *item;
    int flags;
    char *outfile;
    int fdout;
    int np, numdigs;
    orte_ns_cmp_bitmask_t mask;
    orte_iof_job_t *jptr;
    int j;
    bool found;

    /* don't do this if the dst vpid is invalid or the fd is negative! */
    if (ORTE_VPID_INVALID == dst_name->vpid || fd < 0) {
        return ORTE_SUCCESS;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:mrhnp pushing fd %d for process %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         fd, ORTE_NAME_PRINT(dst_name)));

    /* we get a push for stdout, stderr, and stddiag on every LOCAL process, so
     * setup to read those streams and forward them to the next app_context
     */
    if (!(src_tag & ORTE_IOF_STDIN)) {
        /* set the file descriptor to non-blocking - do this before we setup
         * and activate the read event in case it fires right away
         */
        if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
            opal_output(orte_iof_base_framework.framework_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                        __FILE__, __LINE__, errno);
        } else {
            flags |= O_NONBLOCK;
            fcntl(fd, F_SETFL, flags);
        }
        /* do we already have this process in our list? */
        for (item = opal_list_get_first(&mca_iof_mr_hnp_component.procs);
             item != opal_list_get_end(&mca_iof_mr_hnp_component.procs);
             item = opal_list_get_next(item)) {
            proct = (orte_iof_proc_t*)item;
            mask = ORTE_NS_CMP_ALL;
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, dst_name)) {
                /* found it - skip the creation and output-file setup */
                goto SETUP;
            }
        }
        /* if we get here, then we don't yet have this proc in our list */
        proct = OBJ_NEW(orte_iof_proc_t);
        proct->name.jobid = dst_name->jobid;
        proct->name.vpid = dst_name->vpid;
        opal_list_append(&mca_iof_mr_hnp_component.procs, &proct->super);
        /* see if we are to output to a file */
        if (NULL != orte_output_filename) {
            /* get the jobdata for this proc */
            if (NULL == (jdata = orte_get_job_data_object(dst_name->jobid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return ORTE_ERR_NOT_FOUND;
            }
            np = jdata->num_procs / 10;
            /* determine the number of digits required for max vpid */
            numdigs = 1;
            while (np > 0) {
                numdigs++;
                np = np / 10;
            }
            /* construct the filename: <base>.<local jobid>.<zero-padded vpid> */
            asprintf(&outfile, "%s.%d.%0*lu", orte_output_filename,
                     (int)ORTE_LOCAL_JOBID(proct->name.jobid),
                     numdigs, (unsigned long)proct->name.vpid);
            /* create the file */
            fdout = open(outfile, O_CREAT|O_RDWR|O_TRUNC, 0644);
            free(outfile);
            if (fdout < 0) {
                /* couldn't be opened */
                ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
                return ORTE_ERR_FILE_OPEN_FAILURE;
            }
            /* define a sink to that file descriptor */
            ORTE_IOF_SINK_DEFINE(&sink, dst_name, fdout, ORTE_IOF_STDOUTALL,
                                 orte_iof_base_write_handler,
                                 &mca_iof_mr_hnp_component.sinks);
        }

    SETUP:
        /* define a read event but don't activate it */
        if (src_tag & ORTE_IOF_STDOUT) {
            ORTE_IOF_READ_EVENT(&proct->revstdout, dst_name, fd, ORTE_IOF_STDOUT,
                                orte_iof_mrhnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDERR) {
            ORTE_IOF_READ_EVENT(&proct->revstderr, dst_name, fd, ORTE_IOF_STDERR,
                                orte_iof_mrhnp_read_local_handler, false);
        } else if (src_tag & ORTE_IOF_STDDIAG) {
            ORTE_IOF_READ_EVENT(&proct->revstddiag, dst_name, fd, ORTE_IOF_STDDIAG,
                                orte_iof_mrhnp_read_local_handler, false);
        }
        /* if -all- of the readevents for this proc have been defined, then
         * activate them. Otherwise, we can think that the proc is complete
         * because one of the readevents fires -prior- to all of them having been defined!
         */
        if (NULL != proct->revstdout && NULL != proct->revstderr && NULL != proct->revstddiag) {
            /* now activate read events */
            proct->revstdout->active = true;
            opal_event_add(proct->revstdout->ev, 0);
            proct->revstderr->active = true;
            opal_event_add(proct->revstderr->ev, 0);
            proct->revstddiag->active = true;
            opal_event_add(proct->revstddiag->ev, 0);
        }
        return ORTE_SUCCESS;
    }

    /*** HANDLE STDIN PUSH ***/

    /* get the job object for this proc and check to see if it
     * is a mapper - if so, add it to the jobs that receive
     * our stdin
     */
    jdata = orte_get_job_data_object(dst_name->jobid);
    if (NULL == jdata) {
        /* without a job object there are no attributes to inspect; fail the
         * same way the output-file branch above does rather than
         * dereferencing NULL
         */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_MAPPER, NULL, OPAL_BOOL)) {
        /* see if we already have it */
        found = false;
        for (j=0; j < mca_iof_mr_hnp_component.stdin_jobs.size; j++) {
            if (NULL == (jptr = (orte_iof_job_t*)opal_pointer_array_get_item(&mca_iof_mr_hnp_component.stdin_jobs, j))) {
                continue;
            }
            if (jptr->jdata->jobid == jdata->jobid) {
                found = true;
                break;
            }
        }
        if (!found) {
            /* first time we see this job - track it, holding a reference
             * on the job object for as long as the tracker points to it
             */
            jptr = OBJ_NEW(orte_iof_job_t);
            OBJ_RETAIN(jdata);
            jptr->jdata = jdata;
            opal_bitmap_init(&jptr->xoff, jdata->num_procs);
            opal_pointer_array_add(&mca_iof_mr_hnp_component.stdin_jobs, jptr);
        }
    }

    /* now setup the read - but check to only do this once */
    if (NULL == mca_iof_mr_hnp_component.stdinev) {
        /* Since we are the HNP, we don't want to set nonblocking on our
         * stdio stream.  If we do so, we set the file descriptor to
         * non-blocking for everyone that has that file descriptor, which
         * includes everyone else in our shell pipeline chain.  (See
         * http://lists.freebsd.org/pipermail/freebsd-hackers/2005-January/009742.html).
         * This causes things like "mpirun -np 1 big_app | cat" to lose
         * output, because cat's stdout is then ALSO non-blocking and cat
         * isn't built to deal with that case (same with almost all other
         * unix text utils).
         */
        if (0 != fd) {
            if((flags = fcntl(fd, F_GETFL, 0)) < 0) {
                opal_output(orte_iof_base_framework.framework_output, "[%s:%d]: fcntl(F_GETFL) failed with errno=%d\n",
                            __FILE__, __LINE__, errno);
            } else {
                flags |= O_NONBLOCK;
                fcntl(fd, F_SETFL, flags);
            }
        }
        if (isatty(fd)) {
            /* We should avoid trying to read from stdin if we
             * have a terminal, but are backgrounded.  Catch the
             * signals that are commonly used when we switch
             * between being backgrounded and not.  If the
             * filedescriptor is not a tty, don't worry about it
             * and always stay connected.
             */
            opal_event_signal_set(orte_event_base, &mca_iof_mr_hnp_component.stdinsig,
                                  SIGCONT, orte_iof_mrhnp_stdin_cb,
                                  NULL);

            /* setup a read event to read stdin, but don't activate it yet. The
             * dst_name indicates who should receive the stdin. If that recipient
             * doesn't do a corresponding pull, however, then the stdin will
             * be dropped upon receipt at the local daemon
             */
            ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_mrhnp_read_local_handler, false);

            /* check to see if we want the stdin read event to be
             * active - we will always at least define the event,
             * but may delay its activation
             */
            if (!(src_tag & ORTE_IOF_STDIN) || orte_iof_mrhnp_stdin_check(fd)) {
                mca_iof_mr_hnp_component.stdinev->active = true;
                opal_event_add(mca_iof_mr_hnp_component.stdinev->ev, 0);
            }
        } else {
            /* if we are not looking at a tty, just setup a read event
             * and activate it
             */
            ORTE_IOF_READ_EVENT(&mca_iof_mr_hnp_component.stdinev,
                                dst_name, fd, ORTE_IOF_STDIN,
                                orte_iof_mrhnp_read_local_handler, true);
        }
    }
    return ORTE_SUCCESS;
}
Beispiel #17
0
/**
 * Register a set of peer processes with the DR PML.
 *
 * The procs are checked (same architecture on heterogeneous builds, same
 * selected PML), handed to the BML, and every initialized BTL is verified to
 * have an eager limit large enough for a DR header.  The PML receive
 * callback and error handler are then registered with the BML, the buffer
 * free list is initialized, and one DR endpoint per proc is created and
 * stored in mca_pml_dr.endpoints.
 *
 * @param procs   array of procs to add
 * @param nprocs  number of entries in procs; 0 is a no-op
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_NOT_SUPPORTED for a proc with a
 *         different architecture, OMPI_ERR_BAD_PARAM when a BTL eager limit
 *         is too small, or the error code of the failing callee.
 */
int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    for (i = 0 ; i < nprocs ; ++i) {
        if (procs[i]->proc_arch != ompi_proc_local()->proc_arch) {
            return OMPI_ERR_NOT_SUPPORTED;
        }
    }
#endif

    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr",
                                                              procs,
                                                              nprocs))) {
        return rc;
    }

    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc) {
        OBJ_DESTRUCT(&reachable);
        return rc;
    }

    /* initialize bml endpoint data */
    rc = mca_bml.bml_add_procs(
                               nprocs,
                               procs,
                               &reachable
                               );

    /* the reachability map is not consulted anywhere below, so release it
       right away - this keeps every later return path free of the leak */
    OBJ_DESTRUCT(&reachable);

    if(OMPI_SUCCESS != rc)
        return rc;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */

    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm =
            (mca_btl_base_selected_module_t*) item;
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_dr_hdr_t)) {
            orte_show_help("help-mpi-pml-dr.txt", "eager_limit_too_small",
                           true,
                           sm->btl_component->btl_version.mca_component_name,
                           orte_process_info.nodename,
                           sm->btl_component->btl_version.mca_component_name,
                           sm->btl_module->btl_eager_limit,
                           sm->btl_component->btl_version.mca_component_name,
                           sizeof(mca_pml_dr_hdr_t),
                           sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            return rc;
        }
    }

    /* register recv handler */
    rc = mca_bml.bml_register(
                              MCA_BTL_TAG_PML,
                              mca_pml_dr_recv_frag_callback,
                              NULL);

    if(OMPI_SUCCESS != rc)
        return rc;

    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_dr_error_handler);

    if(OMPI_SUCCESS != rc)
        return rc;

    /* buffers are sized to hold the buffer descriptor plus one eager
       message */
    ompi_free_list_init_new(
                        &mca_pml_dr.buffers,
                        sizeof(mca_pml_dr_buffer_t) + mca_pml_dr.eager_limit,
                        opal_cache_line_size,
                        OBJ_CLASS(mca_pml_dr_buffer_t),
                        0,opal_cache_line_size,
                        0,
                        mca_pml_dr.free_list_max,
                        mca_pml_dr.free_list_inc,
                        NULL);

    /* initialize pml endpoint data */
    for (i = 0 ; i < nprocs ; ++i) {
        int idx;
        mca_pml_dr_endpoint_t *endpoint;

        endpoint = OBJ_NEW(mca_pml_dr_endpoint_t);
        endpoint->proc_ompi = procs[i];
        procs[i]->proc_pml = (struct mca_pml_endpoint_t*) endpoint;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: adding endpoint %p to proc_pml %p\n",
                              __FILE__, __LINE__, (void*)endpoint, (void*)procs[i]));

        /* this won't work for comm spawn and other dynamic
           processes, but will work for initial job start */
        idx = opal_pointer_array_add(&mca_pml_dr.endpoints, (void*) endpoint);
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                           ORTE_PROC_MY_NAME,
                           &(endpoint->proc_ompi->proc_name))) {
            /* this endpoint is ourselves - remember our own index */
            mca_pml_dr.my_rank = idx;
        }
        endpoint->local = endpoint->dst = idx;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: setting endpoint->dst to %d\n",
                              __FILE__, __LINE__, idx));

        endpoint->bml_endpoint = procs[i]->proc_bml;
    }

    /* second pass: my_rank may only have been found late in the loop above,
       so the source index is filled in afterwards.
       NOTE(review): endpoints are fetched by loop index here, which matches
       the indices handed out above only if the endpoints array was empty on
       entry - confirm for repeated add_procs calls. */
    for(i = 0; i < nprocs; i++) {
        mca_pml_dr_endpoint_t* ep =  (mca_pml_dr_endpoint_t*)
            opal_pointer_array_get_item(&mca_pml_dr.endpoints, i);
            ep->src = mca_pml_dr.my_rank;
    }
    return rc;
}