int ompi_init_do_oob_preconnect(void)
{
    size_t world_size, i, next, prev, my_index = 0;
    ompi_proc_t **procs;
    int ret;
    struct iovec msg[1];

    procs = ompi_proc_world(&world_size);

    msg[0].iov_base = NULL;
    msg[0].iov_len = 0;

    if (world_size == 2) {
        if (ompi_proc_local() == procs[0]) {
            ret = orte_rml.send(&procs[1]->proc_name,
                                msg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        } else {
            ret = orte_rml.recv(&procs[0]->proc_name,
                                msg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }
    } else if (world_size > 2) {
        for (i = 0 ; i < world_size ; ++i) {
            if (ompi_proc_local() == procs[i]) {
                my_index = i;
                break;
            }
        }

        for (i = 1 ; i <= world_size / 2 ; ++i) {
            next = (my_index + i) % world_size;
            prev = (my_index - i + world_size) % world_size;

            /* sends do not wait for a match */
            ret = orte_rml.send(&procs[next]->proc_name,
                                msg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;

            ret = orte_rml.recv(&procs[prev]->proc_name,
                                msg,
                                1,
                                ORTE_RML_TAG_WIREUP,
                                0);
            if (ret < 0) return ret;
        }
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 2
0
/*
 * Allocate an MTL endpoint for each of the nprocs entries of procs and
 * fill in its ptl_proc from the modex data published for
 * mca_mtl_portals4_component.
 *
 * mtl            - MTL module (not used here)
 * nprocs         - number of entries in procs / mtl_peer_data
 * procs          - processes to add
 * mtl_peer_data  - output array; entry i receives the new endpoint
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_NOT_SUPPORTED if a proc's architecture
 * differs from the local one, OMPI_ERR_OUT_OF_RESOURCE on allocation
 * failure, OMPI_ERR_BAD_PARAM if the modex blob has the wrong size, or
 * the error returned by ompi_modex_recv().  On failure the endpoint of
 * the failing entry is released and its slot set to NULL; endpoints
 * created for earlier entries are left in mtl_peer_data.
 */
int
ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs, 
                            struct mca_mtl_base_endpoint_t **mtl_peer_data)
{
    int ret;
    size_t i;

    /* Get the list of ptl_process_id_t from the runtime and copy into structure */
    for (i = 0 ; i < nprocs ; ++i) {
        ptl_process_t *id;
        size_t size;

        if (procs[i]->proc_arch != ompi_proc_local()->proc_arch) {
            opal_output_verbose(1, ompi_mtl_base_output,
                                "Portals 4 MTL does not support heterogeneous operations.");
            opal_output_verbose(1, ompi_mtl_base_output,
                                "Proc %s architecture %x, mine %x.",
                                ORTE_NAME_PRINT(&procs[i]->proc_name), 
                                procs[i]->proc_arch, ompi_proc_local()->proc_arch);
            return OMPI_ERR_NOT_SUPPORTED;
        }

        mtl_peer_data[i] = malloc(sizeof(*mtl_peer_data[i]));
        if (NULL == mtl_peer_data[i]) {
            /* ret has not been assigned yet, so it must not be printed */
            opal_output_verbose(1, ompi_mtl_base_output,
                                "%s:%d: malloc failed\n",
                                __FILE__, __LINE__);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        ret = ompi_modex_recv(&mca_mtl_portals4_component.mtl_version,
                              procs[i], (void**) &id, &size);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            free(mtl_peer_data[i]);
            mtl_peer_data[i] = NULL;
            return ret;
        } else if (sizeof(ptl_process_t) != size) {
            opal_output_verbose(1, ompi_mtl_base_output,
                                "%s:%d: ompi_modex_recv returned size %lu, expected %lu\n",
                                __FILE__, __LINE__,
                                (unsigned long) size,
                                (unsigned long) sizeof(ptl_process_t));
            free(mtl_peer_data[i]);
            mtl_peer_data[i] = NULL;
            return OMPI_ERR_BAD_PARAM;
        }

        mtl_peer_data[i]->ptl_proc = *id;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 3
0
/*
 * Walk the local group of comm looking for processes that share the
 * caller's jobid and are flagged as being on the local node.
 *
 * comm   - communicator whose local group is scanned
 * color  - out: group index of the first matching process, or -1 when
 *          no process matches
 * ncount - out: number of matching processes minus one
 */
static void
mca_coll_hierarch_checkfor_sm ( struct ompi_communicator_t *comm, int *color,  int *ncount )
{
    struct ompi_proc_t **group_procs;
    struct ompi_proc_t *self;
    int comm_size, idx;
    int matches = 0;

    *color = -1;
    comm_size = ompi_comm_size(comm);
    self = ompi_proc_local();
    group_procs = comm->c_local_group->grp_proc_pointers;

    for (idx = 0; idx < comm_size; ++idx) {
        struct ompi_proc_t *peer = group_procs[idx];

        /* different job: not a candidate */
        if (OMPI_CAST_RTE_NAME(&peer->super.proc_name)->jobid !=
            OMPI_CAST_RTE_NAME(&self->super.proc_name)->jobid) {
            continue;
        }
        /* same job but not on this node: not a candidate either */
        if (!OPAL_PROC_ON_LOCAL_NODE(peer->super.proc_flags)) {
            continue;
        }

        ++matches;
        if (-1 == *color) {
            *color = idx;
        }
    }

    /* we need to decrease ncount in order to make the other
       allreduce/allgather operations work */
    *ncount = matches - 1;
}
Ejemplo n.º 4
0
/*
 * Set up vader endpoints for the entries of procs that are in the local
 * process's job and flagged as being on the local node.
 *
 * btl          - the vader BTL module
 * nprocs       - number of entries in procs / peers
 * procs        - candidate processes
 * peers        - out: endpoint for each candidate, NULL when the
 *                candidate is in another job or not on the local node
 * reachability - out: bit i is set for every such candidate other than
 *                the local process itself
 *
 * Returns OMPI_SUCCESS (also when there are no local peers),
 * OMPI_ERR_OUT_OF_RESOURCE if the local proc is unavailable, OMPI_ERROR
 * if the local rank is invalid, or the error from first-time init or
 * from opal_bitmap_set_bit().
 */
static int vader_add_procs (struct mca_btl_base_module_t* btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers,
                            opal_bitmap_t *reachability)
{
    mca_btl_vader_component_t *comp = &mca_btl_vader_component;
    mca_btl_vader_t *module = (mca_btl_vader_t *) btl;
    ompi_proc_t *self = ompi_proc_local();
    int32_t idx;
    int32_t next_local = 0;
    int status;

    /* we need our own proc structure to compare against */
    if (NULL == self) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* nobody to talk to: nothing to set up */
    if (1 > MCA_BTL_VADER_NUM_LOCAL_PEERS) {
        return OMPI_SUCCESS;
    }

    /* our local rank must have been defined */
    if (ORTE_LOCAL_RANK_INVALID == MCA_BTL_VADER_LOCAL_RANK) {
        return OMPI_ERROR;
    }

    /* one-time module initialization */
    if (!module->btl_inited) {
        status = vader_btl_first_time_init (module, 1 + MCA_BTL_VADER_NUM_LOCAL_PEERS);
        if (OMPI_SUCCESS != status) {
            return status;
        }
    }

    for (idx = 0 ; idx < (int32_t) nprocs ; ++idx) {
        ompi_proc_t *candidate = procs[idx];

        /* only procs in my job and on my local host can be reached
           via shmem */
        if (!(candidate->proc_name.jobid == self->proc_name.jobid &&
              OPAL_PROC_ON_LOCAL_NODE(candidate->proc_flags))) {
            peers[idx] = NULL;
            continue;
        }

        /* everyone but ourselves goes on the shared memory
           accessibility list */
        if (candidate != self) {
            status = opal_bitmap_set_bit (reachability, idx);
            if (OMPI_SUCCESS != status) {
                return status;
            }
        }

        /* hand out the next endpoint slot */
        peers[idx] = comp->endpoints + next_local;
        init_vader_endpoint (peers[idx], candidate, next_local);
        ++next_local;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 5
0
/*
 * Create a uDAPL endpoint for every entry of ompi_procs other than the
 * local process.
 *
 * btl        - the uDAPL BTL module
 * nprocs     - number of entries in ompi_procs / peers
 * ompi_procs - processes to add
 * peers      - out: endpoint for each process that was added
 * reachable  - out: bit i is set for each process that was added
 *
 * Processes for which mca_btl_udapl_proc_create() or
 * mca_btl_udapl_proc_insert() fails are skipped without error.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE when an endpoint
 * cannot be allocated or mca_btl_udapl_set_peer_parameters() fails.
 */
int mca_btl_udapl_add_procs(
    struct mca_btl_base_module_t* btl, 
    size_t nprocs, 
    struct ompi_proc_t **ompi_procs, 
    struct mca_btl_base_endpoint_t** peers, 
    opal_bitmap_t* reachable)
{
    mca_btl_udapl_module_t *module = (mca_btl_udapl_module_t *) btl;
    int idx;

    for (idx = 0; idx < (int) nprocs; ++idx) {
        struct ompi_proc_t *target = ompi_procs[idx];
        mca_btl_udapl_proc_t *proc_entry;
        mca_btl_base_endpoint_t *endpoint;

        /* never build an endpoint to ourselves */
        if (ompi_proc_local() == target) {
            continue;
        }

        proc_entry = mca_btl_udapl_proc_create(target);
        if (NULL == proc_entry) {
            continue;
        }

        OPAL_THREAD_LOCK(&proc_entry->proc_lock);

        /* The btl_proc datastructure is shared by all uDAPL BTL
         * instances that are trying to reach this destination.
         * Cache the peer instance on the btl_proc.
         */
        endpoint = OBJ_NEW(mca_btl_udapl_endpoint_t);
        if (NULL == endpoint) {
            OPAL_THREAD_UNLOCK(&proc_entry->proc_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        endpoint->endpoint_btl = module;

        if (OMPI_SUCCESS == mca_btl_udapl_proc_insert(proc_entry, endpoint)) {
            opal_bitmap_set_bit(reachable, idx);
            OPAL_THREAD_UNLOCK(&proc_entry->proc_lock);
            peers[idx] = endpoint;
        } else {
            /* could not attach the endpoint: drop it and move on */
            OBJ_RELEASE(endpoint);
            OPAL_THREAD_UNLOCK(&proc_entry->proc_lock);
        }
    }

    /* resize based on number of processes */
    if (OMPI_SUCCESS != mca_btl_udapl_set_peer_parameters(module, nprocs)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 6
0
/*
 * Return the SCTP BTL proc structure for ompi_proc, creating it and
 * registering it in mca_btl_sctp_component.sctp_procs if it is not
 * already there.
 *
 * A new instance is populated from the modex data published for the
 * SCTP BTL: proc_addrs / proc_addr_count come from the received blob
 * and proc_endpoints is allocated with one slot per address.  If the
 * proc is the local process and no local instance is recorded yet, it
 * is also stored as mca_btl_sctp_component.sctp_local.
 *
 * Returns the proc structure, or NULL on allocation failure, modex
 * failure, or a blob whose size is not a multiple of
 * sizeof(mca_btl_sctp_addr_t).
 */
mca_btl_sctp_proc_t* mca_btl_sctp_proc_create(ompi_proc_t* ompi_proc)
{
    int rc;
    size_t size;
    mca_btl_sctp_proc_t* btl_proc;
    uint64_t hash = orte_util_hash_name(&ompi_proc->proc_name);

    OPAL_THREAD_LOCK(&mca_btl_sctp_component.sctp_lock);
    rc = opal_hash_table_get_value_uint64(&mca_btl_sctp_component.sctp_procs, 
                                          hash, (void**)&btl_proc);
    if(OMPI_SUCCESS == rc) {
        OPAL_THREAD_UNLOCK(&mca_btl_sctp_component.sctp_lock);
        return btl_proc;
    }

    btl_proc = OBJ_NEW(mca_btl_sctp_proc_t);
    if(NULL == btl_proc) {
        /* do not leave the component lock held on the failure path */
        OPAL_THREAD_UNLOCK(&mca_btl_sctp_component.sctp_lock);
        return NULL;
    }
    btl_proc->proc_ompi = ompi_proc;
    btl_proc->proc_name = ompi_proc->proc_name;

    /* add to hash table of all proc instance */
    opal_hash_table_set_value_uint64(&mca_btl_sctp_component.sctp_procs,
                                     hash, btl_proc);
    OPAL_THREAD_UNLOCK(&mca_btl_sctp_component.sctp_lock);

    /* lookup sctp parameters exported by this proc */
    rc = ompi_modex_recv( &mca_btl_sctp_component.super.btl_version,
            ompi_proc,
            (void**)&btl_proc->proc_addrs,
            &size );
    if(rc != OMPI_SUCCESS) {
        BTL_ERROR(("mca_base_modex_recv: failed with return value=%d", rc));
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(0 != (size % sizeof(mca_btl_sctp_addr_t))) {
        BTL_ERROR(("mca_base_modex_recv: invalid size %" PRIsize_t "\n", size));
        /* release the instance like every other failure path does */
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    btl_proc->proc_addr_count = size / sizeof(mca_btl_sctp_addr_t);

    /* allocate space for endpoint array - one for each exported address */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc(btl_proc->proc_addr_count * sizeof(mca_btl_base_endpoint_t*));
    if(NULL == btl_proc->proc_endpoints) {
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(NULL == mca_btl_sctp_component.sctp_local && ompi_proc == ompi_proc_local()) {
        mca_btl_sctp_component.sctp_local = btl_proc;
    }
    return btl_proc;
}
Ejemplo n.º 7
0
/*
 * Invoked when there's a new communicator that has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return.
 */
mca_coll_base_module_t *
portals4_comm_query(struct ompi_communicator_t *comm,
        int *priority)
{
    mca_coll_portals4_module_t *portals4_module;
    ptl_process_t              *proc;

    /* For now, we don't support intercommunicators and we probably
       never should handle the single proc case, since there's the
       self module... */
    if (OMPI_COMM_IS_INTER(comm) || ompi_comm_size(comm) < 2) {
        return NULL;
    }

    /* Make sure someone is populating the proc table, since we're not
       in a really good position to do so */
    proc = ompi_proc_local()->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4];
    if (NULL == proc) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                "%s:%d: Proc table not previously populated",
                __FILE__, __LINE__);
        return NULL;
    }

    /* check for logical addressing mode in the MTL */
    if (0 == proc->phys.pid) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                "%s:%d: proc->phys.pid==0, so mtl-portals4 is using logical addressing which coll-portals4 doesn't support.  Disqualifying myself.",
                __FILE__, __LINE__);
        return NULL;
    }

    portals4_module = OBJ_NEW(mca_coll_portals4_module_t);
    if (NULL == portals4_module) return NULL;

    *priority = mca_coll_portals4_priority;
    portals4_module->coll_count = 0;
    portals4_module->super.coll_module_enable = portals4_module_enable;
    portals4_module->super.ft_event = NULL;

    portals4_module->super.coll_barrier = ompi_coll_portals4_barrier_intra;
    portals4_module->super.coll_ibarrier = ompi_coll_portals4_ibarrier_intra;

    portals4_module->super.coll_bcast = ompi_coll_portals4_bcast_intra;
    portals4_module->super.coll_ibcast = ompi_coll_portals4_ibcast_intra;

    portals4_module->super.coll_allreduce = ompi_coll_portals4_allreduce_intra;
    portals4_module->super.coll_iallreduce = ompi_coll_portals4_iallreduce_intra;

    portals4_module->super.coll_reduce = ompi_coll_portals4_reduce_intra;
    portals4_module->super.coll_ireduce = ompi_coll_portals4_ireduce_intra;

    return &(portals4_module->super);
}
Ejemplo n.º 8
0
/*
 * Try to adopt the incoming socket sd as the connection for
 * btl_endpoint.
 *
 * The socket is taken when the endpoint currently has no socket
 * (endpoint_sd < 0), or when the endpoint is not in the
 * MCA_BTL_TCP_CONNECTED state and the peer's name compares lower than
 * the local process name.  In that case any existing connection is
 * closed, a connect ack is sent on sd and the receive event is armed.
 *
 * btl_endpoint - endpoint the connection request was matched to
 * addr         - peer address (not used here)
 * sd           - accepted socket descriptor
 *
 * Returns true if the socket was adopted; false if the endpoint has no
 * address, the socket was not wanted, or the connect ack failed (the
 * endpoint is closed again in that last case).
 */
bool mca_btl_tcp2_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint,
                                 struct sockaddr* addr, int sd)
{
    mca_btl_tcp_proc_t *peer_proc = btl_endpoint->endpoint_proc;
    const orte_process_name_t *local_name = &(ompi_proc_local()->proc_name);
    bool accepted = false;
    int order;

    if (NULL == btl_endpoint->endpoint_addr) {
        return false;
    }

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);

    order = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                         &peer_proc->proc_ompi->proc_name,
                                         local_name);

    if ((btl_endpoint->endpoint_sd < 0) ||
        (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED &&
         order < 0)) {
        /* drop whatever we had and switch over to the new socket */
        mca_btl_tcp2_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_sd = sd;

        if (OMPI_SUCCESS == mca_btl_tcp2_endpoint_send_connect_ack(btl_endpoint)) {
            mca_btl_tcp_endpoint_event_init(btl_endpoint);
            /* NOT NEEDED if we remove the PERSISTENT flag when we create the
             * first recv_event.
             */
            opal_event_add(&btl_endpoint->endpoint_recv_event, 0);  /* TODO */
            mca_btl_tcp_endpoint_connected(btl_endpoint);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
            mca_btl_tcp2_endpoint_dump(btl_endpoint, "accepted");
#endif
            accepted = true;
        } else {
            /* could not acknowledge: give the socket up again */
            mca_btl_tcp2_endpoint_close(btl_endpoint);
        }
    }

    /* locks are released in the reverse order of acquisition */
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
    return accepted;
}
Ejemplo n.º 9
0
/*
 * Initial query function that is invoked during MPI_INIT, allowing
 * this component to disqualify itself if it doesn't support the
 * required level of thread support.  This function is invoked exactly
 * once.
 *
 * Neither argument is examined here.  The component is usable only if a
 * job session directory exists and at least one other process of the
 * same job is flagged as being on the local node.
 *
 * Returns OMPI_SUCCESS when usable, OMPI_ERR_OUT_OF_RESOURCE when the
 * proc information or the session directory is missing, and
 * OMPI_ERR_NOT_AVAILABLE when there is no other local process.
 */
int mca_coll_sm_init_query(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    ompi_proc_t *my_proc, **procs;
    size_t i, size;

    /* See if there are other procs in my job on this node.  If not,
       then don't bother going any further. */
    if (NULL == (my_proc = ompi_proc_local()) ||
        NULL == (procs = ompi_proc_all(&size))) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: weirdness on procs; disqualifying myself");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    /* if no session directory was created, then we cannot be used */
    if (NULL == ompi_process_info.job_session_dir) {
        /* the proc array is ours to free on every exit path */
        free(procs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    if (size <= 1) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: comm size too small; disqualifying myself");
        free(procs);
        return OMPI_ERR_NOT_AVAILABLE;
    }
    /* stop at the first other proc in my job on this node */
    for (i = 0; i < size; ++i) {
        if (procs[i] != my_proc &&
            procs[i]->proc_name.jobid == my_proc->proc_name.jobid &&
            OPAL_PROC_ON_LOCAL_NODE(procs[i]->proc_flags)) {
            break;
        }
    }
    free(procs);
    /* i ran off the end: the loop never found a local peer */
    if (i >= size) {
        opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                            "coll:sm:init_query: no other local procs; disqualifying myself");
        return OMPI_ERR_NOT_AVAILABLE;
    }

    /* Don't do much here because we don't really want to allocate any
       shared memory until this component is selected to be used. */
    opal_output_verbose(10, ompi_coll_base_framework.framework_output,
                        "coll:sm:init_query: pick me! pick me!");
    return OMPI_SUCCESS;
}
Ejemplo n.º 10
0
/*
 * Add nprocs processes to the CM PML by handing them to the selected
 * MTL.
 *
 * When built with OPAL_ENABLE_HETEROGENEOUS_SUPPORT, the call fails
 * with OMPI_ERR_NOT_SUPPORTED as soon as a process whose architecture
 * differs from the local one is found.  It also fails with the error
 * from mca_pml_base_pml_check_selected() if that check does not pass.
 * Otherwise the MTL's add_procs result is returned.
 */
int
mca_pml_cm_add_procs(struct ompi_proc_t** procs, size_t nprocs)
{
    int rc;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    {
        size_t idx = 0;

        while (idx < nprocs) {
            if (procs[idx]->super.proc_arch != ompi_proc_local()->super.proc_arch) {
                return OMPI_ERR_NOT_SUPPORTED;
            }
            ++idx;
        }
    }
#endif

    /* every remote proc must have selected the same PML as we did */
    rc = mca_pml_base_pml_check_selected("cm", procs, nprocs);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    return OMPI_MTL_CALL(add_procs(ompi_mtl, nprocs, procs));
}
Ejemplo n.º 11
0
int
ompi_common_portals_ni_initialize(ptl_handle_ni_t *ni_handle, bool *accel)
{
    int ret;

    *accel = false;

    OPAL_THREAD_ADD32(&ni_usage_count, 1);
    if (PTL_INVALID_HANDLE != active_ni_h) {
        *ni_handle = active_ni_h;
        return OMPI_SUCCESS;
    }

    if (setup_utcp_params) {
        ompi_proc_t **procs;
        int my_rid = 0;
        ptl_process_id_t *info;
        char *nidmap = NULL, *pidmap = NULL;
        char *nid_str, *pid_str;
        size_t map_size = 0;
        size_t nprocs, size, i;
        char *tmp;
        ompi_proc_t* proc_self = ompi_proc_local();
        int max_interfaces;

        /* get our world */
        procs = ompi_proc_world(&nprocs);

        map_size = nprocs * 12 + 1; /* 12 is max length of long in decimal */
        nidmap = malloc(map_size);
        pidmap = malloc(map_size);
        nid_str = malloc(12 + 1);
        pid_str = malloc(12 + 1);
        if (NULL == nidmap || NULL == pidmap || 
            NULL == nid_str || NULL == pid_str)
            return OMPI_ERROR;
         
        for (i = 0 ; i < nprocs ; ++i) {
            if (proc_self == procs[i]) my_rid = i;

            ret = ompi_modex_recv(&portals_component,
                                          procs[i], (void**) &info, &size);
            if (OMPI_SUCCESS != ret) {
                opal_output(0, "%5d: ompi_modex_recv failed: %d", 
                            getpid(), ret);
                return ret;
            } else if (sizeof(ptl_process_id_t) != size) {
                opal_output(0, "%5d: ompi_modex_recv returned size %d, expected %d", 
                            getpid(), size, sizeof(ptl_process_id_t));
                return OMPI_ERROR;
            }

            if (i == 0) {
                snprintf(nidmap, map_size, "%u", ntohl(info->nid));
                snprintf(pidmap, map_size, "%u", ntohl(info->pid));
            } else {
                snprintf(nid_str, 12 + 1, ":%u", ntohl(info->nid));
                snprintf(pid_str, 12 + 1, ":%u", ntohl(info->pid));
                strncat(nidmap, nid_str, 12);
                strncat(pidmap, pid_str, 12);
            }

            free(info);
        }

        asprintf(&tmp, "PTL_MY_RID=%u", my_rid);
        putenv(tmp);
        asprintf(&tmp, "PTL_NIDMAP=%s", nidmap);
        putenv(tmp);
        asprintf(&tmp, "PTL_PIDMAP=%s", pidmap);
        putenv(tmp);
        asprintf(&tmp, "PTL_IFACE=%s", ptl_ifname);
        putenv(tmp);

        free(pidmap);
        free(nidmap);
        free(pid_str);
        free(nid_str);

        /*
         * Initialize Portals
         */

        ret = PtlInit(&max_interfaces);
        if (PTL_OK != ret) {
            opal_output(0, "%5d: PtlInit failed, returning %d\n", 
                        getpid(), ret);
            return OMPI_ERR_NOT_AVAILABLE;
        }
        init_called = true;

        /* tell the UTCP runtime code to read the env variables */
        PtlSetRank(PTL_INVALID_HANDLE, -1, -1);

        /* Initialize a network device */
        ret = PtlNIInit(PTL_IFACE_DEFAULT, /* interface to initialize */
                        PTL_PID_ANY,       /* let library assign our pid */
                        NULL,              /* no desired limits */
                        NULL,              /* no need to have limits around */
                        &active_ni_h       /* our interface handle */
                        );
        if (PTL_OK != ret) {
            opal_output(0, "%5d: PtlNIInit failed, returning %d\n", 
                        getpid(), ret);
            return OMPI_ERR_FATAL;
        }

        *ni_handle = active_ni_h;

        return OMPI_SUCCESS;
    }

    /* shouldn't ever be able to get here */
    return OMPI_ERROR;
}
Ejemplo n.º 12
0
/*
 * Record the Portals 4 process id of each of the nprocs entries of
 * procs in its proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] slot.
 *
 * For every proc the id is fetched from the modex.  If the slot is
 * empty a copy of the id is allocated and stored; otherwise the stored
 * id must match the modex id (phys.nid and phys.pid).  When flow
 * control is compiled in and at least one new id was stored,
 * ompi_mtl_portals4_flowctl_add_procs() is called with the index of the
 * local process in procs.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_NOT_SUPPORTED on an architecture
 * mismatch, OMPI_ERR_BAD_PARAM on an unexpected modex size,
 * OMPI_ERR_OUT_OF_RESOURCE on allocation failure, OMPI_ERROR when a
 * stored id disagrees with the modex, or the error from the modex or
 * flow-control call.
 */
int
ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs)
{
    /* me stays -1 when the local proc is not part of procs, so the
       flow-control call never reads an indeterminate value.
       NOTE(review): confirm how flowctl_add_procs treats a negative
       index. */
    int ret, me = -1;
    size_t i;
    bool new_found = false;

    /* Get the list of ptl_process_id_t from the runtime and copy into structure */
    for (i = 0 ; i < nprocs ; ++i) {
        ptl_process_t *modex_id;
        size_t size;

        if( procs[i] == ompi_proc_local_proc ) {
            me = (int) i;
        }

        if (procs[i]->super.proc_arch != ompi_proc_local()->super.proc_arch) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Portals 4 MTL does not support heterogeneous operations.");
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Proc %s architecture %x, mine %x.",
                                OMPI_NAME_PRINT(&procs[i]->super.proc_name), 
                                procs[i]->super.proc_arch, ompi_proc_local()->super.proc_arch);
            return OMPI_ERR_NOT_SUPPORTED;
        }

        OPAL_MODEX_RECV(ret, &mca_mtl_portals4_component.mtl_version,
                        &procs[i]->super, (char**)&modex_id, &size);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        } else if (sizeof(ptl_process_t) != size) {
            /* ret is OMPI_SUCCESS here; report the sizes instead */
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv returned size %lu, expected %lu\n",
                                __FILE__, __LINE__,
                                (unsigned long) size,
                                (unsigned long) sizeof(ptl_process_t));
            return OMPI_ERR_BAD_PARAM;
        }

        if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) {
            ptl_process_t *peer_id;
            peer_id = malloc(sizeof(ptl_process_t));
            if (NULL == peer_id) {
                /* ret carries no information about the malloc failure */
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: malloc failed\n",
                                    __FILE__, __LINE__);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }
            *peer_id = *modex_id;
            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] = peer_id;

            new_found = true;
        } else {
            ptl_process_t *proc = (ptl_process_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4];
            if (proc->phys.nid != modex_id->phys.nid ||
                proc->phys.pid != modex_id->phys.pid) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and modex peer don't match\n",
                                    __FILE__, __LINE__);
                return OMPI_ERROR;
            }
        }
    }

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    if (new_found) {
        ret = ompi_mtl_portals4_flowctl_add_procs(me, nprocs, procs);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: flowctl_add_procs failed: %d\n",
                                __FILE__, __LINE__, ret);
            return ret;
        }
    }
#else
    /* only the flow-control path consumes these */
    (void) me;
    (void) new_found;
#endif

    return OMPI_SUCCESS;
}
Ejemplo n.º 13
0
/*
 * Set up SCIF endpoints for the entries of procs that are flagged as
 * being on the local host, excluding the local process itself.
 *
 * btl       - the SCIF BTL module
 * nprocs    - number of entries in procs / peers
 * procs     - candidate processes
 * peers     - out: endpoint for each reachable proc, NULL for procs
 *             that are not on the local host; the slot of the local
 *             process is left untouched
 * reachable - out: bit i is set for each reachable proc
 *
 * The endpoint array and the mpools are created only when the module
 * has no endpoint array yet.  After the endpoints are initialized the
 * listening thread is started.
 *
 * Returns OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE if the endpoint array
 * cannot be allocated, OMPI_ERROR if the listening thread cannot be
 * created, or the error from mpool setup, endpoint initialization or
 * opal_bitmap_set_bit().
 */
int mca_btl_scif_add_procs(struct mca_btl_base_module_t* btl,
                           size_t nprocs,
                           struct ompi_proc_t **procs,
                           struct mca_btl_base_endpoint_t **peers,
                           opal_bitmap_t *reachable) {
    mca_btl_scif_module_t *scif_module = (mca_btl_scif_module_t *) btl;
    size_t procs_on_board, i, board_proc;
    ompi_proc_t *my_proc = ompi_proc_local();
    int rc;

    /* determine how many procs are on this board */
    for (i = 0, procs_on_board = 0 ; i < nprocs ; ++i) {
        struct ompi_proc_t *ompi_proc = procs[i];

        /* scif can only be used with other procs on this board */
        if (my_proc == ompi_proc ||
            !OPAL_PROC_ON_LOCAL_HOST(ompi_proc->proc_flags)) {
            continue;
        }

        procs_on_board++;
    }

    /* allocate space for the detected peers and setup the mpool */
    if (NULL == scif_module->endpoints) {
        scif_module->endpoints = calloc (procs_on_board, sizeof (mca_btl_base_endpoint_t));
        if (OPAL_UNLIKELY(NULL == scif_module->endpoints)) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        rc = mca_btl_scif_setup_mpools (scif_module);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            BTL_ERROR(("btl/scif error setting up mpools/free lists"));
            return rc;
        }
    }

    for (i = 0, board_proc = 0 ; i < nprocs ; ++i) {
        struct ompi_proc_t *ompi_proc = procs[i];

        if (my_proc == ompi_proc) {
            continue;
        }

        if (!OPAL_PROC_ON_LOCAL_HOST(ompi_proc->proc_flags)) {
            peers[i] = NULL;
            /* scif can only be used with procs on this board */
            continue;
        }

        /* Initialize endpoints */
        rc = mca_btl_scif_ep_init (scif_module->endpoints + board_proc, (mca_btl_scif_module_t *) btl, ompi_proc);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            BTL_ERROR(("btl/scif error initializing endpoint"));
            return rc;
        }

        scif_module->endpoints[board_proc].id = board_proc;

        /* Set the reachable bit */
        rc = opal_bitmap_set_bit (reachable, i);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            return rc;
        }

        /* Store a reference to this peer */
        peers[i] = scif_module->endpoints + board_proc;

        board_proc++;
    }

    BTL_VERBOSE(("%lu procs on board\n", (unsigned long) procs_on_board));

    scif_module->endpoint_count = procs_on_board;

    /* start listening thread; pthread_create reports failure with a
       positive error number, never a negative value */
    rc = pthread_create (&mca_btl_scif_module.listen_thread, NULL, mca_btl_scif_connect_accept, NULL);
    if (0 != rc) {
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 14
0
/*
 * Copy the packed description of type into a scratch buffer and rebuild
 * a datatype from that copy.
 *
 * type     - datatype whose packed description is round-tripped
 * unpacked - out: the datatype rebuilt from the copy, NULL on failure
 *
 * Returns 0 on success, 1 if the scratch buffer cannot be allocated, or
 * the non-zero value from ompi_datatype_get_pack_description().  The
 * scratch buffer is released on every path.
 */
static int
pack_unpack_description(MPI_Datatype type, struct ompi_datatype_t **unpacked)
{
    const void *packed_ddt;
    void *payload, *ptr;
    size_t packed_ddt_len;
    int ret;

    *unpacked = NULL;

    packed_ddt_len = ompi_datatype_pack_description_length(type);
    /* payload is passed by address to the unpack routine, so keep a
       second pointer to the start of the buffer for free() */
    ptr = payload = malloc(packed_ddt_len);
    if (NULL == payload) {
        return 1;
    }

    ret = ompi_datatype_get_pack_description(type, &packed_ddt);
    if (0 == ret) {
        memcpy(payload, packed_ddt, packed_ddt_len);
        *unpacked = ompi_datatype_create_from_packed_description(&payload,
                    ompi_proc_local());
    }

    free(ptr);
    return ret;
}

/*
 * Round-trip three datatypes through their packed description: MPI_INT,
 * an hindexed type built on MPI_DOUBLE, and a struct type combining
 * MPI_INT with the hindexed type.  The first must come back as MPI_INT
 * itself; the other two must come back non-NULL.
 *
 * Returns 0 when all three tests pass, non-zero otherwise.
 */
int
main(int argc, char* argv[])
{
    struct ompi_datatype_t *unpacked_dt;
    int ret = 0;
    int         blen[2];
    MPI_Aint    disp[2];
    MPI_Datatype newType, types[2], struct_type;

    MPI_Init(&argc, &argv);

    /* Basic test... */
    printf("---> Basic test with MPI_INT\n");

    ret = pack_unpack_description(MPI_INT, &unpacked_dt);
    if (ret != 0) goto cleanup;
    if (unpacked_dt == MPI_INT) {
        printf("\tPASSED\n");
    } else {
        printf("\tFAILED: datatypes don't match\n");
        ret = 1;
        goto cleanup;
    }

    printf("---> Advanced test with hindexed\n");

    blen[0] = 10;
    blen[1] = 10;
    disp[0] = 0;
    disp[1] = 20*sizeof(double);

    ret = MPI_Type_create_hindexed(2, blen, disp, MPI_DOUBLE,
                                   &newType);
    if (ret != 0) goto cleanup;

    ret = MPI_Type_commit(&newType);
    if (ret != 0) goto cleanup;

    ret = pack_unpack_description(newType, &unpacked_dt);
    if (ret != 0) goto cleanup;
    if (unpacked_dt != NULL) {
        printf("\tPASSED\n");
    } else {
        printf("\tFAILED: datatypes don't match\n");
        ret = 1;
        goto cleanup;
    }

    printf("---> Even more advanced test using the previous type and struct\n");
    blen[0] = 11;
    blen[1] = 2;
    disp[0] = 0;
    disp[1] = 64;
    types[0] = MPI_INT;
    types[1] = newType;
    /* capture the result so the check below tests this call */
    ret = MPI_Type_create_struct( 2, blen, disp, types, &struct_type );
    if (ret != 0) goto cleanup;

    ret = MPI_Type_commit(&struct_type);
    if (ret != 0) goto cleanup;

    ret = pack_unpack_description(struct_type, &unpacked_dt);
    if (ret != 0) goto cleanup;
    if (unpacked_dt != NULL) {
        printf("\tPASSED\n");
    } else {
        printf("\tFAILED: datatypes don't match\n");
        ret = 1;
        goto cleanup;
    }

cleanup:
    MPI_Finalize();

    return ret;
}
Ejemplo n.º 15
0
/*
 * Return the TCP BTL proc structure for ompi_proc, creating it and
 * registering it in mca_btl_tcp2_component.tcp_procs if it is not
 * already there.
 *
 * A new instance is populated from the modex data published for this
 * BTL: proc_addrs / proc_addr_count come from the received blob,
 * proc_endpoints is allocated with one slot more than the number of
 * addresses, and each address's addr_family is converted from the
 * MCA_BTL_TCP_AF_* value to the corresponding OS constant.  If the proc
 * is the local process and no local instance is recorded yet, it is
 * also stored as mca_btl_tcp2_component.tcp_local.
 *
 * Returns the proc structure, or NULL on allocation failure, modex
 * failure, or a blob whose size is not a multiple of
 * sizeof(mca_btl_tcp2_addr_t).
 */
mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc)
{
    int rc;
    size_t size;
    mca_btl_tcp2_proc_t* btl_proc;
    uint64_t hash = orte_util_hash_name(&ompi_proc->proc_name);

    OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock);
    rc = opal_hash_table_get_value_uint64(&mca_btl_tcp2_component.tcp_procs, 
                                          hash, (void**)&btl_proc);
    if(OMPI_SUCCESS == rc) {
        OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);
        return btl_proc;
    }

    btl_proc = OBJ_NEW(mca_btl_tcp2_proc_t);
    if(NULL == btl_proc) {
        /* do not leave the component lock held on the failure path */
        OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);
        return NULL;
    }
    btl_proc->proc_ompi = ompi_proc;
    
    /* add to hash table of all proc instance */
    opal_hash_table_set_value_uint64(&mca_btl_tcp2_component.tcp_procs,
                                     hash, btl_proc);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);

    /* lookup tcp parameters exported by this proc */
    rc = ompi_modex_recv( &mca_btl_tcp2_component.super.btl_version,
                                  ompi_proc,
                                  (void**)&btl_proc->proc_addrs,
                                  &size );
    if(rc != OMPI_SUCCESS) {
        BTL_ERROR(("mca_base_modex_recv: failed with return value=%d", rc));
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(0 != (size % sizeof(mca_btl_tcp2_addr_t))) {
        BTL_ERROR(("mca_base_modex_recv: invalid size %lu: btl-size: %lu\n",
          (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp2_addr_t)));
        /* release the instance like every other failure path does */
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp2_addr_t);

    /* allocate space for endpoint array - one for each exported address */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc((1 + btl_proc->proc_addr_count) *
                sizeof(mca_btl_base_endpoint_t*));
    if(NULL == btl_proc->proc_endpoints) {
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(NULL == mca_btl_tcp2_component.tcp_local && ompi_proc == ompi_proc_local()) {
        mca_btl_tcp2_component.tcp_local = btl_proc;
    }
    {
        /* convert the OMPI addr_family field to OS constants,
         * so we can check for AF_INET (or AF_INET6) and don't have
         * to deal with byte ordering anymore.
         */
        unsigned int i;
        for (i = 0; i < btl_proc->proc_addr_count; i++) {
            if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET;
            }
#if OPAL_WANT_IPV6
            if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET6;
            }
#endif
        }
    }
    return btl_proc;
}
Ejemplo n.º 16
0
/**
 * Register a set of peer processes with the Portals 4 MTL.
 *
 * For every entry in procs the peer's Portals process id is fetched from
 * the modex.  If the proc has no Portals 4 endpoint cached yet, one is
 * allocated and stored on the proc; otherwise the cached endpoint is
 * checked against the modex data.  When logical addressing
 * (ompi_mtl_portals4.use_logical) is enabled, a rank -> physical id table
 * is built along the way and handed to PtlSetMap().
 *
 * @param mtl     MTL module (not referenced by this routine)
 * @param nprocs  number of entries in procs
 * @param procs   processes to add
 *
 * @return OMPI_SUCCESS on success.  OMPI_ERR_OUT_OF_RESOURCE on allocation
 *         failure, OMPI_ERR_NOT_SUPPORTED if a peer has a different
 *         architecture, OMPI_ERR_BAD_PARAM if the modex blob has the wrong
 *         size, OMPI_ERROR if a cached endpoint disagrees with the modex,
 *         or the error code of the failing call otherwise.
 */
int
ompi_mtl_portals4_add_procs(struct mca_mtl_base_module_t *mtl,
                            size_t nprocs,
                            struct ompi_proc_t** procs)
{
    /* me stays -1 if the local proc is not part of procs, instead of
       being passed on uninitialized */
    int ret, me = -1;
    size_t i;
    bool new_found = false;
    /* only allocated in logical mode; NULL otherwise so that the common
       cleanup path can free it unconditionally */
    ptl_process_t *maptable = NULL;

    if (ompi_mtl_portals4.use_logical) {
        maptable = malloc(sizeof(ptl_process_t) * nprocs);
        if (NULL == maptable) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: malloc failed\n",
                                __FILE__, __LINE__);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* Get the list of ptl_process_id_t from the runtime and copy into structure */
    for (i = 0 ; i < nprocs ; ++i) {
        ptl_process_t *modex_id;
        size_t size;

        if( procs[i] == ompi_proc_local_proc ) {
            me = (int) i;
        }

        if (procs[i]->super.proc_arch != ompi_proc_local()->super.proc_arch) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Portals 4 MTL does not support heterogeneous operations.");
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "Proc %s architecture %x, mine %x.",
                                OMPI_NAME_PRINT(&procs[i]->super.proc_name),
                                procs[i]->super.proc_arch, ompi_proc_local()->super.proc_arch);
            ret = OMPI_ERR_NOT_SUPPORTED;
            goto cleanup;
        }

        OPAL_MODEX_RECV(ret, &mca_mtl_portals4_component.mtl_version,
                        &procs[i]->super.proc_name, (uint8_t**)&modex_id, &size);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv failed: %d\n",
                                __FILE__, __LINE__, ret);
            goto cleanup;
        } else if (sizeof(ptl_process_t) != size) {
            /* the receive itself succeeded, so report the sizes rather
               than the (successful) return code */
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: ompi_modex_recv returned size %lu, expected %lu\n",
                                __FILE__, __LINE__,
                                (unsigned long) size,
                                (unsigned long) sizeof(ptl_process_t));
            ret = OMPI_ERR_BAD_PARAM;
            goto cleanup;
        }

        if (NULL == procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4]) {
            /* first time this proc is seen: create and cache its endpoint */
            ptl_process_t *peer_id;
            peer_id = malloc(sizeof(ptl_process_t));
            if (NULL == peer_id) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: malloc failed: %d\n",
                                    __FILE__, __LINE__, ret);
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto cleanup;
            }
            if (ompi_mtl_portals4.use_logical) {
                /* logical mode: the endpoint holds the rank, the physical
                   id goes into the map table */
                peer_id->rank = i;
                maptable[i].phys.pid = modex_id->phys.pid;
                maptable[i].phys.nid = modex_id->phys.nid;
                opal_output_verbose(50, ompi_mtl_base_framework.framework_output,
                    "logical: global rank=%d pid=%d nid=%d\n",
                    (int)i, maptable[i].phys.pid, maptable[i].phys.nid);
            } else {
                *peer_id = *modex_id;
            }

            procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4] = peer_id;

            new_found = true;
        } else {
            /* endpoint already cached: it must agree with what we see now */
            ptl_process_t *proc = (ptl_process_t*) procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PORTALS4];
            if (ompi_mtl_portals4.use_logical) {
                if ((size_t)proc->rank != i) {
                    opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and rank don't match\n",
                                    __FILE__, __LINE__);
                    ret = OMPI_ERROR;
                    goto cleanup;
                }
                maptable[i].phys.pid = modex_id->phys.pid;
                maptable[i].phys.nid = modex_id->phys.nid;
            }
            else if (proc->phys.nid != modex_id->phys.nid ||
                     proc->phys.pid != modex_id->phys.pid) {
                opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                    "%s:%d: existing peer and modex peer don't match\n",
                                    __FILE__, __LINE__);
                ret = OMPI_ERROR;
                goto cleanup;
            }
        }
    }

    if (ompi_mtl_portals4.use_logical) {
        ret = PtlSetMap(ompi_mtl_portals4.ni_h, nprocs, maptable);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: logical mapping failed: %d\n",
                                __FILE__, __LINE__, ret);
            goto cleanup;
        }
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "logical mapping OK\n");
        /* the table is not needed past this point */
        free(maptable);
        maptable = NULL;
    }

    portals4_init_interface();

    /* activate progress callback */
    ret = opal_progress_register(ompi_mtl_portals4_progress);
    if (OMPI_SUCCESS != ret) {
        opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                            "%s:%d: opal_progress_register failed: %d\n",
                            __FILE__, __LINE__, ret);
        goto cleanup;
    }

#if OMPI_MTL_PORTALS4_FLOW_CONTROL
    if (new_found) {
        ret = ompi_mtl_portals4_flowctl_add_procs(me, nprocs, procs);
        if (OMPI_SUCCESS != ret) {
            opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
                                "%s:%d: flowctl_add_procs failed: %d\n",
                                __FILE__, __LINE__, ret);
            goto cleanup;
        }
    }
#endif

    ret = OMPI_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path */
    free(maptable);
    return ret;
}
Ejemplo n.º 17
0
/*
 * Create an SCTP endpoint for each peer process handed in by the caller.
 *
 * For every entry of ompi_procs other than the local process, the SCTP
 * proc structure is looked up/created, a new endpoint bound to this BTL is
 * inserted into it, and on success the peer is marked in the reachable
 * bitmap and its endpoint stored in peers[].  Peers whose insertion fails
 * are silently skipped.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERR_OUT_OF_RESOURCE if the local proc, a
 * proc structure or an endpoint object cannot be obtained.
 */
int mca_btl_sctp_add_procs(
    struct mca_btl_base_module_t* btl, 
    size_t nprocs, 
    struct ompi_proc_t **ompi_procs, 
    struct mca_btl_base_endpoint_t** peers, 
    opal_bitmap_t* reachable)
{
    mca_btl_sctp_module_t* module = (mca_btl_sctp_module_t*) btl;
    ompi_proc_t* self = ompi_proc_local();
    int idx;

    if (NULL == self) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (idx = 0; idx < (int) nprocs; ++idx) {
        struct ompi_proc_t* peer = ompi_procs[idx];
        mca_btl_sctp_proc_t* peer_proc;
        mca_btl_base_endpoint_t* ep;

        /* never build a loopback SCTP connection to ourselves */
        if (peer == self) {
            continue;
        }

        peer_proc = mca_btl_sctp_proc_create(peer);
        if (NULL == peer_proc) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* The proc structure is shared between all SCTP BTL instances
         * that target this peer, so endpoint creation and insertion are
         * done while holding its lock. */
        OPAL_THREAD_LOCK(&peer_proc->proc_lock);

        ep = OBJ_NEW(mca_btl_sctp_endpoint_t);
        if (NULL == ep) {
            OPAL_THREAD_UNLOCK(&peer_proc->proc_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        ep->endpoint_btl = module;

        /* Insertion fails when the peer cannot be bound to this BTL
         * instance; in that case drop the endpoint and move on. */
        if (OMPI_SUCCESS != mca_btl_sctp_proc_insert(peer_proc, ep)) {
            OBJ_RELEASE(ep);
            OPAL_THREAD_UNLOCK(&peer_proc->proc_lock);
            continue;
        }

        opal_bitmap_set_bit(reachable, idx);
        OPAL_THREAD_UNLOCK(&peer_proc->proc_lock);

        peers[idx] = ep;
        opal_list_append(&module->sctp_endpoints, (opal_list_item_t*) ep);

        /* one event-library user per connected peer, so the event
           library stays in use for as long as a peer is connected */
        opal_progress_event_users_increment();
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 18
0
/*
 * Initialize comm world/self/null/parent.
 *
 * Constructs the global communicator pointer array and the predefined
 * communicators MPI_COMM_WORLD, MPI_COMM_SELF and MPI_COMM_NULL, stores
 * them at indices 0, 1 and 2 of that array, points the parent
 * communicator at MPI_COMM_NULL and finally calls ompi_comm_reg_init().
 *
 * Returns OMPI_ERROR if the pointer array cannot be initialized,
 * OMPI_SUCCESS otherwise.
 */
int ompi_comm_init(void)
{
    ompi_group_t *group;
    size_t size;

    /* Setup communicator array */
    OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t); 
    if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 0,
                                                OMPI_FORTRAN_HANDLE_MAX, 64) ) {
        return OMPI_ERROR;
    }

    /* Setup MPI_COMM_WORLD */
    OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t);
    /* the world group is built from the proc list returned by
       ompi_proc_world(); the group keeps that array */
    group = OBJ_NEW(ompi_group_t);
    group->grp_proc_pointers = ompi_proc_world(&size);
    group->grp_proc_count    = (int)size;
    OMPI_GROUP_SET_INTRINSIC (group);
    OMPI_GROUP_SET_DENSE (group);
    ompi_set_group_rank(group, ompi_proc_local());
    ompi_group_increment_proc_count (group);

    ompi_mpi_comm_world.comm.c_contextid    = 0;
    ompi_mpi_comm_world.comm.c_id_start_index = 4;
    ompi_mpi_comm_world.comm.c_id_available = 4;
    ompi_mpi_comm_world.comm.c_f_to_c_index = 0;
    ompi_mpi_comm_world.comm.c_my_rank      = group->grp_my_rank;
    /* the same group object is used as local and remote group, so it is
       retained once more for the second pointer */
    ompi_mpi_comm_world.comm.c_local_group  = group;
    ompi_mpi_comm_world.comm.c_remote_group = group;
    OBJ_RETAIN(ompi_mpi_comm_world.comm.c_remote_group);
    ompi_mpi_comm_world.comm.c_cube_dim     = opal_cube_dim((int)size);
    ompi_mpi_comm_world.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_world.comm);
    opal_pointer_array_set_item (&ompi_mpi_communicators, 0, &ompi_mpi_comm_world);

    MEMCHECKER (memset (ompi_mpi_comm_world.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    /* length is strlen + 1, so the terminating NUL is copied as well */
    strncpy (ompi_mpi_comm_world.comm.c_name, "MPI_COMM_WORLD",
             strlen("MPI_COMM_WORLD")+1 );
    ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* We have to create a hash (although it is legal to leave this
       field NULL -- the attribute accessor functions will interpret
       this as "there are no attributes cached on this object")
       because MPI_COMM_WORLD has some predefined attributes. */
    ompi_attr_hash_init(&ompi_mpi_comm_world.comm.c_keyhash);

    /* Setup MPI_COMM_SELF */
    OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t);
    /* the self group is built from ompi_proc_self() and the local rank
       is set to 0 directly */
    group = OBJ_NEW(ompi_group_t);
    group->grp_proc_pointers = ompi_proc_self(&size);
    group->grp_my_rank       = 0;
    group->grp_proc_count    = (int)size;
    OMPI_GROUP_SET_INTRINSIC (group);
    OMPI_GROUP_SET_DENSE (group);
    
    ompi_mpi_comm_self.comm.c_contextid    = 1;
    ompi_mpi_comm_self.comm.c_f_to_c_index = 1;
    ompi_mpi_comm_self.comm.c_id_start_index = 20;
    ompi_mpi_comm_self.comm.c_id_available = 20;
    ompi_mpi_comm_self.comm.c_my_rank      = group->grp_my_rank;
    /* as for MPI_COMM_WORLD: one group, referenced twice */
    ompi_mpi_comm_self.comm.c_local_group  = group;
    ompi_mpi_comm_self.comm.c_remote_group = group;
    OBJ_RETAIN(ompi_mpi_comm_self.comm.c_remote_group);
    ompi_mpi_comm_self.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_self.comm);
    opal_pointer_array_set_item (&ompi_mpi_communicators, 1, &ompi_mpi_comm_self);

    MEMCHECKER (memset (ompi_mpi_comm_self.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    strncpy(ompi_mpi_comm_self.comm.c_name,"MPI_COMM_SELF",strlen("MPI_COMM_SELF")+1);
    ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* We can set MPI_COMM_SELF's keyhash to NULL because it has no
       predefined attributes.  If a user defines an attribute on
       MPI_COMM_SELF, the keyhash will automatically be created. */
    ompi_mpi_comm_self.comm.c_keyhash = NULL;

    /* Setup MPI_COMM_NULL */
    OBJ_CONSTRUCT(&ompi_mpi_comm_null, ompi_communicator_t);
    /* both group pointers refer to the predefined null group, which is
       retained once per pointer */
    ompi_mpi_comm_null.comm.c_local_group  = &ompi_mpi_group_null.group;
    ompi_mpi_comm_null.comm.c_remote_group = &ompi_mpi_group_null.group;
    OBJ_RETAIN(&ompi_mpi_group_null.group); 
    OBJ_RETAIN(&ompi_mpi_group_null.group);

    ompi_mpi_comm_null.comm.c_contextid    = 2;
    ompi_mpi_comm_null.comm.c_f_to_c_index = 2;
    ompi_mpi_comm_null.comm.c_my_rank      = MPI_PROC_NULL;

    ompi_mpi_comm_null.comm.error_handler  = &ompi_mpi_errors_are_fatal.eh;
    OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh );
    opal_pointer_array_set_item (&ompi_mpi_communicators, 2, &ompi_mpi_comm_null);

    MEMCHECKER (memset (ompi_mpi_comm_null.comm.c_name, 0, MPI_MAX_OBJECT_NAME));
    strncpy(ompi_mpi_comm_null.comm.c_name,"MPI_COMM_NULL",strlen("MPI_COMM_NULL")+1);
    ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_NAMEISSET;
    ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_INTRINSIC;

    /* Initialize the parent communicator to MPI_COMM_NULL */
    ompi_mpi_comm_parent = &ompi_mpi_comm_null.comm;
    /* the parent alias takes its own references on the null communicator,
       the null group and the error handler */
    OBJ_RETAIN(&ompi_mpi_comm_null);
    OBJ_RETAIN(&ompi_mpi_group_null.group);
    OBJ_RETAIN(&ompi_mpi_errors_are_fatal.eh);

    /* initialize the comm_reg stuff for multi-threaded comm_cid
       allocation */
    ompi_comm_reg_init();

    return OMPI_SUCCESS;
}
Ejemplo n.º 19
0
/*
 * Look up which modex entry of a peer proc is paired with a given local
 * module.
 *
 * The first call for a proc builds a per-proc match table (one slot per
 * local module, -1 meaning "no match") by solving a bipartite assignment
 * between local modules and the peer's modex entries; later calls reuse
 * the cached table.
 *
 * Implementation note: both peers must arrive at the same pairing.  This
 * relies on the local module order matching the order of the modex entries
 * that were published, and on add_procs being invoked on the local modules
 * in that same order.  Should either assumption stop holding, the match
 * ordering has to be canonicalized some other way, e.g. by (jobid,vpid)
 * pair or by interface MAC or IP address.
 *
 * module     local module to find a partner for
 * proc       peer proc whose modex entries are considered
 * index_out  receives the index into proc->proc_modex, or -1
 *
 * Returns OMPI_SUCCESS when a usable entry was found, OMPI_ERR_BAD_PARAM
 * for a NULL index_out, OMPI_ERR_OUT_OF_RESOURCE if the table cannot be
 * allocated, OMPI_ERR_NOT_FOUND when there is no match, OMPI_ERR_UNREACH
 * on an MTU mismatch, or the error of a failing graph helper.
 */
static int match_modex(ompi_btl_usnic_module_t *module,
                       ompi_btl_usnic_proc_t *proc,
                       int *index_out)
{
    ompi_btl_usnic_graph_t *graph = NULL;
    uint32_t num_modules;
    size_t slot;
    int rc = OMPI_SUCCESS;
    int num_pairs;
    int *pairs;
    int matched;
    bool proc_is_left;
    const char *peer_host;

    if (NULL == index_out) {
        return OMPI_ERR_BAD_PARAM;
    }
    *index_out = -1;

    num_modules = mca_btl_usnic_component.num_modules;

    opal_output_verbose(20, USNIC_OUT, "btl:usnic:%s: module=%p proc=%p with dimensions %d x %d",
                        __func__, (void *)module, (void *)proc,
                        num_modules, (int)proc->proc_modex_count);

    /* The match-up table is computed once and cached on the proc rather
     * than on the module: MPI dynamic process routines can add procs, but
     * never new modules. */
    if (NULL == proc->proc_ep_match_table) {
        proc->proc_ep_match_table =
            malloc(num_modules * sizeof(*proc->proc_ep_match_table));
        if (NULL == proc->proc_ep_match_table) {
            OMPI_ERROR_LOG(OMPI_ERR_OUT_OF_RESOURCE);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        /* every slot starts out as "no match" */
        slot = 0;
        while (slot < num_modules) {
            proc->proc_ep_match_table[slot] = -1;
            ++slot;
        }

        /* Two peers solving "mirror image" graphs are not guaranteed to
         * end up with the same matching (e.g. when all edges are equal).
         * Make both sides build the identical graph by always placing the
         * process with the lower (jobid,vpid) on the "left". */
        proc_is_left =
            (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                          &proc->proc_ompi->proc_name,
                                          &(ompi_proc_local()->proc_name)) < 0);

        rc = create_proc_module_graph(proc, proc_is_left, &graph);
        if (OMPI_SUCCESS != rc) {
            goto out_free_table;
        }

        num_pairs = 0;
        rc = ompi_btl_usnic_solve_bipartite_assignment(graph, &num_pairs, &pairs);
        if (OMPI_SUCCESS != rc) {
            OMPI_ERROR_LOG(rc);
            goto out_free_graph;
        }

        edge_pairs_to_match_table(proc, proc_is_left, num_pairs, pairs);

        rc = ompi_btl_usnic_gr_free(graph);
        if (OMPI_SUCCESS != rc) {
            OMPI_ERROR_LOG(rc);
            return rc;
        }
    }

    if (!proc->proc_match_exists) {
        opal_output_verbose(5, USNIC_OUT, "btl:usnic:%s: unable to find any valid interface pairs for proc %s",
                            __func__, OMPI_NAME_PRINT(&proc->proc_ompi->proc_name));
        return OMPI_ERR_NOT_FOUND;
    }

    /* proc_match_exists is known to be true from here on; barring strange
     * failure cases the table is present as well */
    if (NULL != proc->proc_ep_match_table) {
        for (slot = 0; slot < num_modules; ++slot) {
            if (mca_btl_usnic_component.usnic_active_modules[slot] == module) {
                *index_out = proc->proc_ep_match_table[slot];
                break;
            }
        }
    }

    matched = *index_out;

    /* A pairing with differing MTUs is reported and rejected. */
    /* TODO with UDP, do we still want to enforce this restriction or just
     * take the min of the two MTUs?  Another choice is to disqualify this
     * pairing before running the matching algorithm on it. */
    if (matched >= 0 && proc->proc_modex[matched].mtu != module->if_mtu) {
        peer_host = proc->proc_ompi->proc_hostname;
        if (NULL == peer_host) {
            peer_host = "unknown";
        }
        opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch",
                    true,
                    ompi_process_info.nodename,
                    ibv_get_device_name(module->device),
                    module->port_num,
                    module->if_mtu,
                    peer_host,
                    proc->proc_modex[matched].mtu);
        *index_out = -1;
        return OMPI_ERR_UNREACH;
    }

    return (-1 == matched) ? OMPI_ERR_NOT_FOUND : OMPI_SUCCESS;

out_free_graph:
    ompi_btl_usnic_gr_free(graph);
out_free_table:
    /* leave the proc without a cached table so a later call starts over */
    free(proc->proc_ep_match_table);
    proc->proc_ep_match_table = NULL;
    proc->proc_match_exists = false;
    return rc;
}
Ejemplo n.º 20
0
/* Prints a few terse statistics lines via opal_output(0,...).  The first
 * line will be prefixed with the string "prefix".  If "reset_stats" is true
 * then the statistics will be reset after printing.
 *
 * NOTE: this routine ignores the setting of stats_enable, so it can be used
 * for debugging routines even when normal stats reporting is not enabled.
 *
 * The output is assembled in two pieces: the send/receive counters are
 * formatted into "str", a PML summary (or, when no PML calls were counted,
 * the send/receive window extents over all endpoints) is formatted into
 * "tmp", and "tmp" is then appended to "str" before the single
 * opal_output() call.
 */
void ompi_btl_usnic_print_stats(
    ompi_btl_usnic_module_t *module,
    const char *prefix,
    bool reset_stats)
{
    char tmp[128], str[2048];

    /* The usuals */
    snprintf(str, sizeof(str), "%s:MCW:%3u, ST(P+D)/F/C/R(T+F)/A:%8lu(%8u+%8u)/%8lu/%8lu/%4lu(%4lu+%4lu)/%8lu, RcvTot/Chk/F/C/L/H/D/BF/A:%8lu/%c%c/%8lu/%8lu/%4lu+%2lu/%4lu/%4lu/%6lu OA/DA %4lu/%4lu CRC:%4lu ",
             prefix,
             /* MCW: vpid of the local process */
             ompi_proc_local()->proc_name.vpid,

             /* ST(P+D)/F/C/R(T+F)/A: total sends (priority channel +
                data channel), fragment sends, chunk sends, resends
                (timeout + fast retransmits), ACK sends */
             module->stats.num_total_sends,
             module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends,
             module->mod_channels[USNIC_DATA_CHANNEL].num_channel_sends,
             module->stats.num_frag_sends,
             module->stats.num_chunk_sends,
             module->stats.num_resends,
             module->stats.num_timeout_retrans,
             module->stats.num_fast_retrans,
             module->stats.num_ack_sends,

             /* RcvTot: total receives */
             module->stats.num_total_recvs,
             /* Chk, first char: 'g' if every receive was reposted,
                'B' otherwise */
             (module->stats.num_total_recvs -
              module->stats.num_recv_reposts) == 0 ? 'g' : 'B',
             /* Chk, second char: 'g' if the per-category receive
                counters add up to the total, 'B' otherwise */
             (module->stats.num_total_recvs -
              module->stats.num_frag_recvs -
              module->stats.num_chunk_recvs -
              module->stats.num_badfrag_recvs -
              module->stats.num_oow_low_recvs -
              module->stats.num_oow_high_recvs -
              module->stats.num_dup_recvs -
              module->stats.num_ack_recvs -
              module->stats.num_unk_recvs) == 0 ? 'g' : 'B',
             /* F/C/L+H/D/BF/A: fragment, chunk, out-of-window low + high,
                duplicate, bad-fragment and ACK receives */
             module->stats.num_frag_recvs,
             module->stats.num_chunk_recvs,
             module->stats.num_oow_low_recvs,
             module->stats.num_oow_high_recvs,
             module->stats.num_dup_recvs,
             module->stats.num_badfrag_recvs,
             module->stats.num_ack_recvs,

             /* OA/DA: old duplicate ACKs / duplicate ACKs */
             module->stats.num_old_dup_acks,
             module->stats.num_dup_acks,

             /* CRC: CRC errors */
             module->stats.num_crc_errors);

    /* If our PML calls were 0, then show send and receive window
       extents instead */
    if (module->stats.pml_module_sends +
        module->stats.pml_send_callbacks == 0) {
        /* minima start at WINDOW_SIZE * 2 and maxima at 0 so that the
           first endpoint visited replaces them */
        int64_t send_unacked, su_min = WINDOW_SIZE * 2, su_max = 0;
        int64_t recv_depth, rd_min = WINDOW_SIZE * 2, rd_max = 0;
        ompi_btl_usnic_endpoint_t *endpoint;
        opal_list_item_t *item;

        /* re-applies the same initial values given in the declarations
           above */
        rd_min = su_min = WINDOW_SIZE * 2;
        rd_max = su_max = 0;

        /* walk every endpoint of this module and track min/max of both
           window measures */
        item = opal_list_get_first(&module->all_endpoints);
        while (item != opal_list_get_end(&(module->all_endpoints))) {
            endpoint = container_of(item, mca_btl_base_endpoint_t,
                    endpoint_endpoint_li);
            item = opal_list_get_next(item);

            /* Number of un-acked sends (i.e., sends for which we're
               still waiting for ACK) */
            send_unacked =
                endpoint->endpoint_next_seq_to_send -
                endpoint->endpoint_ack_seq_rcvd - 1;
            if (send_unacked > su_max) su_max = send_unacked;
            if (send_unacked < su_min) su_min = send_unacked;

            /* Receive window depth (i.e., difference between highest
               seq received and the next message we haven't ACKed
               yet) */
            recv_depth =
                endpoint->endpoint_highest_seq_rcvd -
                endpoint->endpoint_next_contig_seq_to_recv;
            if (recv_depth > rd_max) rd_max = recv_depth;
            if (recv_depth < rd_min) rd_min = recv_depth;
        }
        /* NOTE(review): the int64_t extents are printed with %ld --
           confirm this matches on all supported targets */
        snprintf(tmp, sizeof(tmp), "PML S:%1ld, Win!A/R:%4ld/%4ld %4ld/%4ld",
                 module->stats.pml_module_sends,
                 su_min, su_max,
                 rd_min, rd_max);
    } else {
        /* PML sends, PML send callbacks, and their difference */
        snprintf(tmp, sizeof(tmp), "PML S/CB/Diff:%4lu/%4lu=%4ld",
                module->stats.pml_module_sends,
                module->stats.pml_send_callbacks,
                module->stats.pml_module_sends -
                 module->stats.pml_send_callbacks);
    }

    /* append the PML part, leaving room for the terminating NUL */
    strncat(str, tmp, sizeof(str) - strlen(str) - 1);
    opal_output(0, "%s", str);

    if (reset_stats) {
        usnic_stats_reset(module);
    }
}
Ejemplo n.º 21
0
/*
 * Initialize the PSM MTL component.
 *
 * Works out how many processes of the job share this node and which local
 * rank this process has among them, registers a no-op PSM error handler,
 * optionally sets the PSM debug level, initializes the PSM library and
 * completes the module setup.
 *
 * Both parameters are unused here.  Returns the MTL module on success and
 * NULL if the proc list cannot be obtained or a PSM call fails.
 */
static mca_mtl_base_module_t*
ompi_mtl_psm_component_init(bool enable_progress_threads,
                           bool enable_mpi_threads)
{
    psm_error_t psm_rc;
    int ver_major = PSM_VERNO_MAJOR;
    int ver_minor = PSM_VERNO_MINOR;
    ompi_proc_t *self;
    ompi_proc_t **world;
    size_t world_size;
    size_t idx;
    int local_rank = -1;
    int num_local_procs = 0;

    /* PSM needs the number of processes on this host and our rank among
     * them so that it can spread hardware contexts across processes. */
    if (OMPI_SUCCESS != ompi_proc_refresh()) {
        return NULL;
    }

    self = ompi_proc_local();
    world = ompi_proc_world(&world_size);
    if (NULL == world) {
        return NULL;
    }

    for (idx = 0; idx < world_size; ++idx) {
        if (world[idx] == self) {
            /* our local rank is the number of local procs seen so far */
            local_rank = num_local_procs++;
        } else if (OPAL_PROC_ON_LOCAL_NODE(world[idx]->proc_flags)) {
            num_local_procs++;
        }
    }

    assert(local_rank >= 0 && num_local_procs > 0);
    free(world);

    psm_rc = psm_error_register_handler(NULL /* no ep */,
                                        PSM_ERRHANDLER_NOP);
    if (psm_rc) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n", 
                    psm_error_get_string(psm_rc));
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    psm_rc = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                        (const void*) &ompi_mtl_psm.debug_level,
                        sizeof(unsigned));
    if (psm_rc) {
        /* not fatal: report it and carry on */
        orte_show_help("help-mtl-psm.txt",
                       "psm init", false,
                       psm_error_get_string(psm_rc));
    }
#endif

    /* Default device list; the "self" device is only added from PSM
     * version 0x0104 on.  The final 0 means an existing PSM_DEVICES
     * setting from the user is left alone. */
    setenv("PSM_DEVICES",
           (PSM_VERNO >= 0x0104) ? "self,shm,ipath" : "shm,ipath",
           0);

    psm_rc = psm_init(&ver_major, &ver_minor);
    if (psm_rc) {
        orte_show_help("help-mtl-psm.txt",
                       "psm init", true,
                       psm_error_get_string(psm_rc));
        return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    /* request size is the PSM request minus the base MTL request part */
    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm.super;
}
Ejemplo n.º 22
0
/*
 * Initialize the MXM MTL module.
 *
 * Generates the job id, checks that the job is large enough for MXM to be
 * enabled (mtl_mxm_np), determines the local node rank, the number of
 * local processes and the highest vpid among node-local processes, opens
 * the MXM endpoint, publishes the endpoint address through the modex and
 * registers the MXM progress function (plus the memory release hook on
 * MXM >= 2.0 when memory hooks are in use).
 *
 * Returns OMPI_SUCCESS on success, OMPI_ERR_NOT_SUPPORTED when the job has
 * fewer processes than mtl_mxm_np, OMPI_ERROR or the code of the failing
 * helper otherwise.
 */
int ompi_mtl_mxm_module_init(void)
{
#if MXM_API < MXM_VERSION(2,0)
    ompi_mtl_mxm_ep_conn_info_t ep_info;
#endif
    void *ep_address;
    size_t ep_address_len;
    mxm_error_t err;
    uint32_t jobid;
    uint64_t mxlr;
    ompi_proc_t **procs;
    unsigned ptl_bitmap;
    size_t totps, proc;
    int lr, nlps;
    int rc;

    mxlr = 0;
    lr = -1;

    jobid = ompi_mtl_mxm_get_job_id();
    if (0 == jobid) {
    	MXM_ERROR("Failed to generate jobid");
    	return OMPI_ERROR;
    }

    /* the returned array belongs to us and is released on every path
       below once it is no longer needed */
    if (NULL == (procs = ompi_proc_world(&totps))) {
        MXM_ERROR("Unable to obtain process list");
        return OMPI_ERROR;
    }

    if (totps < (size_t)ompi_mtl_mxm.mxm_np) {
        MXM_VERBOSE(1, "MXM support will be disabled because of total number "
                    "of processes (%lu) is less than the minimum set by the "
                    "mtl_mxm_np MCA parameter (%u)",
                    (unsigned long) totps, ompi_mtl_mxm.mxm_np);
        free(procs);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    MXM_VERBOSE(1, "MXM support enabled");

    if (ORTE_NODE_RANK_INVALID == (lr = ompi_process_info.my_node_rank)) {
        MXM_ERROR("Unable to obtain local node rank");
        free(procs);
        return OMPI_ERROR;
    }
    nlps = ompi_process_info.num_local_peers + 1;

    /* highest vpid among the processes that live on this node */
    for (proc = 0; proc < totps; proc++) {
        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            mxlr = max(mxlr, procs[proc]->proc_name.vpid);
        }
    }

    /* the proc list is not used past this point */
    free(procs);
    procs = NULL;

    /* Setup the endpoint options and local addresses to bind to. */
#if MXM_API < MXM_VERSION(1,5)
    ptl_bitmap = ompi_mtl_mxm.mxm_opts.ptl_bitmap;
#elif MXM_API < MXM_VERSION(2,0)
    ptl_bitmap = ompi_mtl_mxm.mxm_opts->ptl_bitmap;
#else
    ptl_bitmap = 0;
#endif

    /* Open MXM endpoint */
    err = ompi_mtl_mxm_create_ep(ompi_mtl_mxm.mxm_context, &ompi_mtl_mxm.ep,
                                 ptl_bitmap, lr, jobid, mxlr, nlps);
    if (MXM_OK != err) {
        opal_show_help("help-mtl-mxm.txt", "unable to create endpoint", true,
        		mxm_error_string(err));
        return OMPI_ERROR;
    }

    /*
     * Get address for each PTL on this endpoint, and share it with other ranks.
     */
#if MXM_API < MXM_VERSION(2,0)
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SELF)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SELF)) {
    	return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_RDMA)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_RDMA)) {
    	return OMPI_ERROR;
    }
    if ((ptl_bitmap & MXM_BIT(MXM_PTL_SHM)) &&
            OMPI_SUCCESS != ompi_mtl_mxm_get_ep_address(&ep_info, MXM_PTL_SHM)) {
            return OMPI_ERROR;
    }

    /* pre-2.0: the address lives in the on-stack ep_info */
    ep_address = &ep_info;
    ep_address_len = sizeof(ep_info);
#else
    rc = ompi_mtl_mxm_get_ep_address(&ep_address, &ep_address_len);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }
#endif

    rc = ompi_mtl_mxm_send_ep_address(ep_address, ep_address_len);

#if MXM_API >= MXM_VERSION(2,0)
    /* the address buffer is released whether or not the send worked, so
       that it does not leak on the error path */
    free(ep_address);
#endif

    if (OMPI_SUCCESS != rc) {
        MXM_ERROR("Modex session failed.");
        return rc;
    }

    /* Register the MXM progress function */
    opal_progress_register(ompi_mtl_mxm_progress);

#if MXM_API >= MXM_VERSION(2,0)
    if (ompi_mtl_mxm.using_mem_hooks) {
        opal_mem_hooks_register_release(ompi_mtl_mxm_mem_release_cb, NULL);
    }
#endif
    return OMPI_SUCCESS;
}
Ejemplo n.º 23
0
/*
 * Add a set of peer processes to the Portals BTL.
 *
 * Makes sure the network interface handle exists, resolves the Portals
 * process ids of all procs, and for every peer with the same architecture
 * as the local process allocates an endpoint holding that id.  Peers for
 * which PtlNIDist() succeeds are counted and flagged in the reachable
 * bitmap.  If this call makes the first peers known to the module, the
 * event queues are allocated, receives are enabled and the send memory
 * descriptor is filled in.
 *
 * Returns OMPI_SUCCESS on success, OMPI_ERROR on allocation or event queue
 * failure, or the error code of a failing helper.
 */
int
mca_btl_portals_add_procs(struct mca_btl_base_module_t* btl_base,
                          size_t nprocs, struct ompi_proc_t **procs,
                          struct mca_btl_base_endpoint_t** peers,
                          ompi_bitmap_t* reachable)
{
    int ret;
    struct ompi_proc_t *curr_proc = NULL;
    ptl_process_id_t *portals_procs = NULL;
    size_t i;
    unsigned long distance;
    bool need_activate = false;

    assert(&mca_btl_portals_module == (mca_btl_portals_module_t*) btl_base);
    /* nprocs is a size_t; cast it to match the %d conversion */
    opal_output_verbose(50, mca_btl_portals_component.portals_output,
                        "Adding %d procs (%d)", (int) nprocs,
                        mca_btl_portals_module.portals_num_procs);

    /* if we haven't already, get our network handle */
    if (mca_btl_portals_module.portals_ni_h == PTL_INVALID_HANDLE) {
        ret = ompi_common_portals_ni_initialize(&mca_btl_portals_module.portals_ni_h);
        if (OMPI_SUCCESS != ret) return ret;
    }

    portals_procs = malloc(nprocs * sizeof(ptl_process_id_t));
    /* malloc(0) may legitimately return NULL, so only a NULL result for
       a non-empty request is treated as a failure */
    if (NULL == portals_procs && nprocs > 0) {
        return OMPI_ERROR;
    }
    ret = ompi_common_portals_get_procs(nprocs, procs, portals_procs);
    if (OMPI_SUCCESS != ret) {
        free(portals_procs);
        return ret;
    }

    /* activation is only needed if no peer was known before this call */
    if (0 == mca_btl_portals_module.portals_num_procs) {
        need_activate = true;
    }

    /* loop through all procs, setting our reachable flag */
    for (i= 0; i < nprocs ; ++i) {
        curr_proc = procs[i];

        /* portals doesn't support heterogeneous yet... */
        if (ompi_proc_local()->proc_arch != curr_proc->proc_arch) {
            continue;
        }

        peers[i] = malloc(sizeof(mca_btl_base_endpoint_t));
        if (NULL == peers[i]) {
            free(portals_procs);
            return OMPI_ERROR;
        }
        *((mca_btl_base_endpoint_t*) peers[i]) = portals_procs[i];

        /* make sure we can reach the process - this is supposed to be
           a cheap-ish operation */
        ret = PtlNIDist(mca_btl_portals_module.portals_ni_h,
                        portals_procs[i],
                        &distance);
        if (ret != PTL_OK) {
            opal_output_verbose(10, mca_btl_portals_component.portals_output,
                                "Could not find distance to process %d", (int) i);
            continue;
        }

        OPAL_THREAD_ADD32(&mca_btl_portals_module.portals_num_procs, 1);
        /* and here we can reach */
        ompi_bitmap_set_bit(reachable, i);
    }

    /* the ids have been copied into the endpoints; free(NULL) is a no-op */
    free(portals_procs);
    portals_procs = NULL;

    if (need_activate && mca_btl_portals_module.portals_num_procs > 0) {
        /* create eqs */
        int eq_idx;

        opal_output_verbose(50, mca_btl_portals_component.portals_output,
                            "Enabling progress");

        for (eq_idx = 0 ; eq_idx < OMPI_BTL_PORTALS_EQ_SIZE ; ++eq_idx) {
            int ptl_ret = PtlEQAlloc(mca_btl_portals_module.portals_ni_h,
                                     mca_btl_portals_module.portals_eq_sizes[eq_idx],
                                     PTL_EQ_HANDLER_NONE,
                                     &(mca_btl_portals_module.portals_eq_handles[eq_idx]));
            if (PTL_OK != ptl_ret) {
                opal_output(mca_btl_portals_component.portals_output,
                            "Error creating EQ %d: %d", eq_idx, ptl_ret);
                /* BWB - better error code? */
                return OMPI_ERROR;
            }
        }

        /* the result of enabling receives is what this call returns */
        ret = mca_btl_portals_recv_enable(&mca_btl_portals_module);

        /* fill in send memory descriptor */
        mca_btl_portals_module.md_send.start = NULL;
        mca_btl_portals_module.md_send.length = 0;
        mca_btl_portals_module.md_send.threshold = PTL_MD_THRESH_INF;
        mca_btl_portals_module.md_send.max_size = 0;
        mca_btl_portals_module.md_send.options = PTL_MD_EVENT_START_DISABLE;
        mca_btl_portals_module.md_send.user_ptr = NULL;
        mca_btl_portals_module.md_send.eq_handle = 
            mca_btl_portals_module.portals_eq_handles[OMPI_BTL_PORTALS_EQ_SEND];
    } else {
        ret = OMPI_SUCCESS;
    }

    return ret;
}
Ejemplo n.º 24
0
/**
 * Exchange shared-memory backing-file descriptions with every member of the
 * group and map the backing file of each remote peer.
 *
 * Each process contributes a file_info_t describing its own backing file
 * (built from @p input); the descriptions are gathered with
 * comm_allgather_pml().  For every remote peer the function either re-uses a
 * matching entry already on @p peer_list (incrementing its refcnt) or opens
 * and maps the peer's file and appends a new entry to @p peer_list.
 *
 * @param sm_bcol_module  basesmuma module; its sbgp partner module supplies
 *                        my_index, group_size, group_list and group_comm for
 *                        the allgather, and my_index for skipping ourselves.
 * @param module          sbgp module supplying group_size and group_list for
 *                        the per-peer loop.
 * @param peer_list       list of already known peers; searched and appended to.
 * @param back_files      in/out: a previous non-NULL array is freed and
 *                        replaced by a newly calloc'ed array of group_size
 *                        entries, indexed by group rank.
 * @param comm            communicator used to look up each peer's proc.
 * @param input           description of the local backing file.
 * @param base_fname      not referenced by this function.
 * @param map_all         when true, a new mapping is created even for a peer
 *                        that was found on @p peer_list.
 *
 * @return OMPI_SUCCESS on success; OMPI_ERR_BAD_PARAM if the local file name
 *         is too long; OMPI_ERR_OUT_OF_RESOURCE on allocation failure; the
 *         error returned by comm_allgather_pml(); OMPI_ERROR if a peer's file
 *         cannot be opened or mapped.  On failure *back_files may already
 *         point at the new (partially filled) array.
 */
int bcol_basesmuma_smcm_allgather_connection(
                                             mca_bcol_basesmuma_module_t *sm_bcol_module,
                                             mca_sbgp_base_module_t *module,
                                             opal_list_t *peer_list,
                                             bcol_basesmuma_smcm_proc_item_t ***back_files,
                                             ompi_communicator_t *comm,
                                             bcol_basesmuma_smcm_file_t input,
                                             char *base_fname,
                                             bool map_all)
{

    /* define local variables */

    int rc, i, fd;
    ptrdiff_t mem_offset;
    ompi_proc_t *proc_temp, *my_id;
    bcol_basesmuma_smcm_proc_item_t *temp;
    bcol_basesmuma_smcm_proc_item_t *item_ptr;
    bcol_basesmuma_smcm_proc_item_t **backing_files;
    struct file_info_t local_file;
    struct file_info_t *all_files=NULL;

    /* sanity check: the name must fit (with its terminator) into the
     * fixed-size file_name field that is copied with strcpy() below */
    if (strlen(input.file_name) > SM_BACKING_FILE_NAME_MAX_LEN-1) {
        opal_output (ompi_bcol_base_framework.framework_output, "backing file name too long:  %s len :: %d",
                     input.file_name, (int) strlen(input.file_name));
        return OMPI_ERR_BAD_PARAM;
    }

    backing_files = (bcol_basesmuma_smcm_proc_item_t **)
        calloc(module->group_size, sizeof(bcol_basesmuma_smcm_proc_item_t *));
    if (!backing_files) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* FIXME *back_files might have been already allocated
     * so free it in order to avoid a memory leak */
    if (NULL != *back_files) {
        free (*back_files);
    }
    *back_files = backing_files;

    my_id = ompi_proc_local();

    /* Phase One:
       gather a list of processes that will participate in the allgather - I'm
       preparing this list from the sbgp-ing module that was passed into the function */

    /* fill in local file information */
    local_file.vpid  = ((orte_process_name_t*)&my_id->super.proc_name)->vpid;
    local_file.jobid = ((orte_process_name_t*)&my_id->super.proc_name)->jobid;
    local_file.file_size=input.size;
    local_file.size_ctl_structure=input.size_ctl_structure;
    local_file.data_seg_alignment=input.data_seg_alignment;

    strcpy (local_file.file_name, input.file_name);

    /* will exchange this data type as a string of characters -
     * this routine is first called before MPI_init() completes
     * and before error handling is setup, so can't use the
     * MPI data types to send this data */
    all_files = (struct file_info_t *) calloc(module->group_size,
                                              sizeof (struct file_info_t));
    if (!all_files) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* exchange data */
    rc = comm_allgather_pml(&local_file,all_files,sizeof(struct file_info_t), MPI_CHAR,
                            sm_bcol_module->super.sbgp_partner_module->my_index,
                            sm_bcol_module->super.sbgp_partner_module->group_size,
                            sm_bcol_module->super.sbgp_partner_module->group_list,
                            sm_bcol_module->super.sbgp_partner_module->group_comm);
    if( OMPI_SUCCESS != rc ) {
        opal_output (ompi_bcol_base_framework.framework_output, "failed in comm_allgather_pml.  Error code: %d", rc);
        goto Error;
    }

    /* Phase four:
       loop through the receive buffer, unpack the data received from remote peers */

    for (i = 0; i < module->group_size; i++) {
        struct file_info_t *rem_file = all_files + i;

        /* skip my own entry: there is nothing to map for the local process */
        if (sm_bcol_module->super.sbgp_partner_module->my_index == i) {
            continue;
        }

        proc_temp = ompi_comm_peer_lookup(comm,module->group_list[i]);

        OPAL_LIST_FOREACH(item_ptr, peer_list, bcol_basesmuma_smcm_proc_item_t) {
            /* if the vpid/jobid/filename combination already exists in the list,
               then do not map this peer's file --- because you already have */
            if (0 == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                                                  OMPI_CAST_RTE_NAME(&proc_temp->super.proc_name),
                                                  &item_ptr->peer) &&
                0 == strcmp (item_ptr->sm_file.file_name, rem_file->file_name)) {
                ++item_ptr->refcnt;
                /* record file data */
                backing_files[i] = item_ptr;
                break;
            }
        }

        /* an existing mapping is re-used unless map_all was requested; in
         * that case the file is mapped again below and backing_files[i] is
         * overwritten with the new entry (NTH: not sure why exactly since we
         * re-map an already mapped file) */
        if (!map_all && backing_files[i]) {
            continue;
        }

        temp = OBJ_NEW(bcol_basesmuma_smcm_proc_item_t);
        if (!temp) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto Error;
        }

        temp->peer.vpid = rem_file->vpid;
        temp->peer.jobid = rem_file->jobid;

        temp->sm_file.file_name = strdup (rem_file->file_name);
        if (!temp->sm_file.file_name) {
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            OBJ_RELEASE(temp);
            goto Error;
        }

        temp->sm_file.size = (size_t) rem_file->file_size;
        temp->sm_file.mpool_size = (size_t) rem_file->file_size;
        temp->sm_file.size_ctl_structure = (size_t) rem_file->size_ctl_structure;
        temp->sm_file.data_seg_alignment = (size_t) rem_file->data_seg_alignment;
        temp->refcnt = 1;

        /* Phase Five:
           open and map this peer's backing file (we only get here if the
           peer was not found on peer_list or map_all was requested) */
        fd = open(temp->sm_file.file_name, O_RDWR, 0600);
        if (0 > fd) {
            opal_output (ompi_bcol_base_framework.framework_output, "SMCM Allgather failed to open sm backing file %s. errno = %d",
                         temp->sm_file.file_name, errno);
            /* temp is not yet on any list, so it must be released here just
             * like on the other failure paths, otherwise it is leaked */
            OBJ_RELEASE(temp);
            rc = OMPI_ERROR;
            goto Error;
        }

        /* map the file; the descriptor is not needed once the mapping
         * attempt has been made */
        temp->sm_mmap = bcol_basesmuma_smcm_reg_mmap (NULL, fd, temp->sm_file.size,
                                                      temp->sm_file.size_ctl_structure,
                                                      temp->sm_file.data_seg_alignment,
                                                      temp->sm_file.file_name);
        close (fd);
        if (NULL == temp->sm_mmap) {
            opal_output (ompi_bcol_base_framework.framework_output, "mmapping failed to map remote peer's file");
            OBJ_RELEASE(temp);
            rc = OMPI_ERROR;
            goto Error;
        }

        /* compute memory offset of the data area relative to the segment start */
        mem_offset = (ptrdiff_t) temp->sm_mmap->data_addr -
            (ptrdiff_t) temp->sm_mmap->map_seg;
        temp->sm_mmap->map_seg->seg_offset = mem_offset;
        temp->sm_mmap->map_seg->seg_size = temp->sm_file.size - mem_offset;
        /* more stuff to follow */

        /* record file data */
        backing_files[i] = (bcol_basesmuma_smcm_proc_item_t *) temp;

        /* append this peer's info, including shared memory map addr, onto the
           peer_list */
        opal_list_append(peer_list, (opal_list_item_t*) temp);
    }

    rc = OMPI_SUCCESS;

 Error:

    /* common exit (success and failure): the gathered descriptions are only
     * needed inside this function */
    if (NULL != all_files) {
        free(all_files);
    }

    return rc;
}
Ejemplo n.º 25
0
/**
 * Rebuild a datatype from its packed description.
 *
 * Layout as read by this function (all header fields are ints):
 *   - position[0]: combiner (create_type).
 *   - If the combiner is MPI_COMBINER_NAMED, position[1] is the id of a
 *     predefined datatype and the description is exactly two ints long.
 *   - Otherwise position[1..3] hold the number of lengths, displacements and
 *     datatypes.  After the 4-int header follow the displacement array
 *     (aligned with OMPI_DATATYPE_ALIGN_PTR when present), the array of
 *     datatype ids, the array of lengths, and finally the packed descriptions
 *     of every datatype whose id is not a predefined one (handled
 *     recursively).
 *
 * With heterogeneous support enabled and a peer of different endianness, the
 * header values are swapped on copies while the length and displacement
 * arrays are byte-swapped in place inside the packed buffer.
 *
 * @param packed_buffer     in/out: points at the description; on success it
 *                          is advanced past the consumed bytes.  It is left
 *                          unchanged on failure.
 * @param remote_processor  proc that packed the description (used for the
 *                          endianness comparison only).
 *
 * @return the datatype, or NULL if it could not be rebuilt.
 */
static ompi_datatype_t* __ompi_datatype_create_from_packed_description( void** packed_buffer,
                                                                        const struct ompi_proc_t* remote_processor )
{
    int* position;
    ompi_datatype_t* datatype = NULL;
    ompi_datatype_t** array_of_datatype;
    OPAL_PTRDIFF_TYPE* array_of_disp;
    int* array_of_length;
    int number_of_length, number_of_disp, number_of_datatype, data_id;
    int create_type, i;
    char* next_buffer;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    bool need_swap = false;

    /* swap only when exactly one of the two sides is big endian */
    if( (remote_processor->super.proc_arch ^ ompi_proc_local()->super.proc_arch) &
        OPAL_ARCH_ISBIGENDIAN ) {
        need_swap = true;
    }
#endif

    next_buffer = (char*)*packed_buffer;
    position = (int*)next_buffer;

    create_type = position[0];
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if (need_swap) {
        create_type = opal_swap_bytes4(create_type);
    }
#endif
    if( MPI_COMBINER_NAMED == create_type ) {
        /* there we have a simple predefined datatype */
        data_id = position[1];
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
        if (need_swap) {
            data_id = opal_swap_bytes4(data_id);
        }
#endif
        assert( data_id < OMPI_DATATYPE_MAX_PREDEFINED );
        *packed_buffer = position + 2;
        return (ompi_datatype_t*)ompi_datatype_basicDatatypes[data_id];
    }

    number_of_length   = position[1];
    number_of_disp     = position[2];
    number_of_datatype = position[3];
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if (need_swap) {
        number_of_length   = opal_swap_bytes4(number_of_length);
        number_of_disp     = opal_swap_bytes4(number_of_disp);
        number_of_datatype = opal_swap_bytes4(number_of_datatype);
    }
#endif
    array_of_datatype = (ompi_datatype_t**)malloc( sizeof(ompi_datatype_t*) *
                                                   number_of_datatype );
    if( NULL == array_of_datatype && 0 != number_of_datatype ) {
        /* Allocation failed.  A NULL result for a zero-sized request is
         * legal and harmless (the array is never indexed), so only a
         * non-zero count is treated as an error.  Nothing has been acquired
         * yet and *packed_buffer is untouched, matching the other failure
         * path. */
        return NULL;
    }
    next_buffer += (4 * sizeof(int));  /* move after the header */

    /* description of the displacements (if ANY !)  should always be aligned
       on MPI_Aint, aka OPAL_PTRDIFF_TYPE */
    if (number_of_disp > 0) {
        OMPI_DATATYPE_ALIGN_PTR(next_buffer, char*);
    }

    array_of_disp   = (OPAL_PTRDIFF_TYPE*)next_buffer;
    next_buffer    += number_of_disp * sizeof(OPAL_PTRDIFF_TYPE);
    /* the other datatypes */
    position        = (int*)next_buffer;
    next_buffer    += number_of_datatype * sizeof(int);
    /* the array of lengths (32 bits aligned) */
    array_of_length = (int*)next_buffer;
    next_buffer    += (number_of_length * sizeof(int));

    /* resolve every datatype id: predefined ids map straight to the table,
     * anything else is described inline right after the arrays and is
     * rebuilt recursively (which advances next_buffer) */
    for( i = 0; i < number_of_datatype; i++ ) {
        data_id = position[i];
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
        if (need_swap) {
            data_id = opal_swap_bytes4(data_id);
        }
#endif
        if( data_id < OMPI_DATATYPE_MAX_PREDEFINED ) {
            array_of_datatype[i] = (ompi_datatype_t*)ompi_datatype_basicDatatypes[data_id];
            continue;
        }
        array_of_datatype[i] =
            __ompi_datatype_create_from_packed_description( (void**)&next_buffer,
                                                            remote_processor );
        if( NULL == array_of_datatype[i] ) {
            /* don't cleanup more than required. We can now modify these
             * values as we already know we have failed to rebuild the
             * datatype.
             */
            array_of_datatype[i] = (ompi_datatype_t*)ompi_datatype_basicDatatypes[OPAL_DATATYPE_INT1]; /*XXX TODO */
            number_of_datatype = i;
            goto cleanup_and_exit;
        }
    }

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if (need_swap) {
        /* these two arrays live inside the packed buffer and are swapped in place */
        for (i = 0 ; i < number_of_length ; ++i) {
            array_of_length[i] = opal_swap_bytes4(array_of_length[i]);
        }
        for (i = 0 ; i < number_of_disp ; ++i) {
#if SIZEOF_PTRDIFF_T == 4
            array_of_disp[i] = opal_swap_bytes4(array_of_disp[i]);
#elif SIZEOF_PTRDIFF_T == 8
            array_of_disp[i] = (MPI_Aint)opal_swap_bytes8(array_of_disp[i]);
#else
#error "Unknown size of ptrdiff_t"
#endif
        }
    }
#endif
    datatype = __ompi_datatype_create_from_args( array_of_length, array_of_disp,
                                                 array_of_datatype, create_type );
    *packed_buffer = next_buffer;
 cleanup_and_exit:
    /* drop our reference on every non-predefined datatype rebuilt above */
    for( i = 0; i < number_of_datatype; i++ ) {
        if( !(ompi_datatype_is_predefined(array_of_datatype[i])) ) {
            OBJ_RELEASE(array_of_datatype[i]);
        }
    }
    free( array_of_datatype );
    return datatype;
}
Ejemplo n.º 26
0
/**
 * Exchange a memory offset with every member of the sbgp partner group.
 *
 * Each process packs its group index (my_index) and @p mem_offset, the
 * buffers are combined with ompi_rte_allgather_list(), and for every group
 * member the received offset is stored in
 * result_array[SM_ARRAY_INDEX(leading_dim, 0, index_in_group)].
 *
 * @param sm_bcol_module  module whose sbgp partner module defines the group
 *                        (group_size, group_list, group_comm, my_index).
 * @param result_array    out: receives one offset per group member, stored
 *                        as a pointer-sized value.
 * @param mem_offset      local offset to publish.
 * @param loop_limit      not referenced by this function.
 * @param leading_dim     leading dimension passed to SM_ARRAY_INDEX.
 *
 * @return OMPI_SUCCESS, OMPI_ERROR if a buffer or peer entry cannot be
 *         allocated, or the error returned by the pack/unpack/allgather call
 *         that failed.
 */
int base_bcol_basesmuma_exchange_offsets(
    mca_bcol_basesmuma_module_t *sm_bcol_module,
    void **result_array, uint64_t mem_offset, int loop_limit,
    int leading_dim)
{
    int ret=OMPI_SUCCESS,i,dummy;
    int index_in_group, pcnt;
    opal_list_t peers;
    ompi_namelist_t *peer;
    ompi_proc_t *proc_temp;
    opal_buffer_t *send_buffer = OBJ_NEW(opal_buffer_t);
    opal_buffer_t *recv_buffer = OBJ_NEW(opal_buffer_t);
    uint64_t rem_mem_offset;

    /*  exchange the base pointer for the controls structures - gather
     *  every one else's information.
     */

    /* the list is constructed first so that the common exit path can always
     * drain and destruct it */
    OBJ_CONSTRUCT(&peers, opal_list_t);

    if (NULL == send_buffer || NULL == recv_buffer) {
        opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for sbuffer or rbuffer\n");
        ret = OMPI_ERROR;
        goto cleanup;
    }

    /* get list of procs that will participate in the communication */
    for (i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++) {
        /* get the proc info */
        proc_temp = ompi_comm_peer_lookup(
                sm_bcol_module->super.sbgp_partner_module->group_comm,
                sm_bcol_module->super.sbgp_partner_module->group_list[i]);
        peer = OBJ_NEW(ompi_namelist_t);
        if (NULL == peer) {
            opal_output (ompi_bcol_base_framework.framework_output, "Cannot allocate memory for peer list entry\n");
            ret = OMPI_ERROR;
            goto cleanup;
        }
        peer->name.jobid = proc_temp->proc_name.jobid;
        peer->name.vpid = proc_temp->proc_name.vpid;
        opal_list_append(&peers,&peer->super); /* this is with the new field called "super" in ompi_namelist_t struct */
    }

    /* pack my information */
    ret = opal_dss.pack(send_buffer,
        &(sm_bcol_module->super.sbgp_partner_module->my_index),1,OPAL_UINT32);

    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "Error packing my_index!!\n");
        goto cleanup;
    }

    /* pack the offset of the allocated region */
    ret = opal_dss.pack(send_buffer,&(mem_offset),1,OPAL_UINT64);
    if (OMPI_SUCCESS != ret) {
        goto cleanup;
    }

    /* get the offsets from all procs, so can setup the control data
     * structures.
     */
    if (OMPI_SUCCESS != (ret = ompi_rte_allgather_list(&peers, send_buffer, recv_buffer))) {
        opal_output (ompi_bcol_base_framework.framework_output, "ompi_rte_allgather_list returned error %d\n", ret);
        goto cleanup;
    }

    /* unpack the dummy: the receive buffer starts with one int32 that is
     * read and discarded before the per-process records */
    pcnt=1;
    ret = opal_dss.unpack(recv_buffer,&dummy, &pcnt, OPAL_INT32);
    if (OMPI_SUCCESS != ret) {
        opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for dummy\n",ret);
        goto cleanup;
    }

    /* get the control structure offsets within the shared memory
     *   region and populate the control structures - we do not assume
     *   any symmetry in memory layout of each process
     */

    /* loop over the procs in the group; each record is (index, offset) in
     * the same order it was packed above */
    for(i = 0; i < sm_bcol_module->super.sbgp_partner_module->group_size; i++){
        int array_id;
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&index_in_group, &pcnt, OPAL_UINT32);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote index_in_group\n",ret);
            goto cleanup;
        }

        /* get the offset */
        pcnt=1;
        ret = opal_dss.unpack(recv_buffer,&rem_mem_offset, &pcnt, OPAL_UINT64);
        if (OMPI_SUCCESS != ret) {
            opal_output (ompi_bcol_base_framework.framework_output, "unpack returned error %d for remote memory offset\n",ret);
            goto cleanup;
        }

        array_id=SM_ARRAY_INDEX(leading_dim,0,index_in_group);
        /* go through uintptr_t so the integer-to-pointer conversion is
         * well-formed even where pointers are narrower than 64 bits */
        result_array[array_id]=(void *)(uintptr_t)rem_mem_offset;

    }

cleanup:

    /* single exit for success and failure: free peer list and buffers */
    peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    while( NULL !=peer) {
        OBJ_RELEASE(peer);
        peer=(ompi_namelist_t *)opal_list_remove_first(&peers);
    }
    OBJ_DESTRUCT(&peers);
    if( send_buffer ) {
        OBJ_RELEASE(send_buffer);
    }
    if( recv_buffer ) {
        OBJ_RELEASE(recv_buffer);
    }
    return ret;
}
Ejemplo n.º 27
0
/**
 * Register a set of procs with the shared-memory BTL.
 *
 * For every proc in the same job and on the local node (and with the same
 * architecture) an endpoint is created, stored in peers[] and marked in
 * the reachability bitmap.  If at least one such peer exists, the function
 * performs first-time initialization when needed, initializes this
 * process's FIFOs, synchronizes with the other local procs through the
 * shared seg_inited counter, lets smp rank 0 unlink the shared-memory
 * files, and finally records the local addresses of the peers' FIFOs.
 *
 * @param btl           the sm BTL module (cast to mca_btl_sm_t).
 * @param nprocs        number of entries in procs / peers.
 * @param procs         procs to examine.
 * @param peers         out: endpoint per reachable proc; set to NULL for
 *                      procs in another job or not on the local node.
 * @param reachability  out: bit set for every proc that got an endpoint.
 *
 * @return OMPI_SUCCESS (also when there is no peer to talk to),
 *         OMPI_ERR_OUT_OF_RESOURCE if ompi_proc_local() returns NULL,
 *         OMPI_ERROR if an endpoint cannot be created or the local smp rank
 *         was not found, or the error code of the failing helper call.
 */
int mca_btl_sm_add_procs(
    struct mca_btl_base_module_t* btl,
    size_t nprocs,
    struct ompi_proc_t **procs,
    struct mca_btl_base_endpoint_t **peers,
    opal_bitmap_t* reachability)
{
    int return_code = OMPI_SUCCESS;
    int32_t n_local_procs = 0, proc, j, my_smp_rank = -1;
    ompi_proc_t* my_proc; /* pointer to caller's proc structure */
    mca_btl_sm_t *sm_btl;
    bool have_connected_peer = false;
    char **bases;
    /* for easy access to the mpool_sm_module */
    mca_mpool_sm_module_t *sm_mpool_modp = NULL;

    /* initialization */

    sm_btl = (mca_btl_sm_t *)btl;

    /* get pointer to my proc structure */
    if(NULL == (my_proc = ompi_proc_local()))
        return OMPI_ERR_OUT_OF_RESOURCE;

    /* Get unique host identifier for each process in the list,
     * and identify procs that are on this host.  Add procs on this
     * host to shared memory reachability list.  Also, get number
     * of local procs in the procs list. */
    for (proc = 0; proc < (int32_t)nprocs; proc++) {
        /* check to see if this proc can be reached via shmem (i.e.,
           if they're on my local host and in my job) */
        if (procs[proc]->proc_name.jobid != my_proc->proc_name.jobid ||
            !OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            peers[proc] = NULL;
            continue;
        }

        /* check to see if this is me; my smp rank is my position among the
         * local procs seen so far.
         * NOTE(review): peers[proc] is not written on this path nor on the
         * heterogeneous path below, yet it is tested against NULL further
         * down - presumably the caller zero-initializes peers; confirm. */
        if(my_proc == procs[proc]) {
            my_smp_rank = mca_btl_sm_component.my_smp_rank = n_local_procs++;
            continue;
        }

        /* sm doesn't support heterogeneous yet... */
        if (procs[proc]->proc_arch != my_proc->proc_arch) {
            continue;
        }

        /* we have someone to talk to */
        have_connected_peer = true;

        /* the endpoint is created with the peer's local rank (current
         * n_local_procs), which is only advanced once creation succeeded */
        if(!(peers[proc] = create_sm_endpoint(n_local_procs, procs[proc]))) {
            return_code = OMPI_ERROR;
            goto CLEANUP;
        }
        n_local_procs++;

        /* add this proc to shared memory accessibility list */
        return_code = opal_bitmap_set_bit(reachability, proc);
        if(OMPI_SUCCESS != return_code)
            goto CLEANUP;
    }

    /* jump out if there's not someone we can talk to
     * (return_code is still OMPI_SUCCESS here) */
    if (!have_connected_peer)
        goto CLEANUP;

    /* make sure that my_smp_rank has been defined */
    if (-1 == my_smp_rank) {
        return_code = OMPI_ERROR;
        goto CLEANUP;
    }

    /* one-time setup, skipped once the module's btl_inited flag is set */
    if (!sm_btl->btl_inited) {
        return_code =
            sm_btl_first_time_init(sm_btl, my_smp_rank,
                                   mca_btl_sm_component.sm_max_procs);
        if (return_code != OMPI_SUCCESS) {
            goto CLEANUP;
        }
    }

    /* set local proc's smp rank in the peers structure for
     * rapid access and calculate reachability */
    for(proc = 0; proc < (int32_t)nprocs; proc++) {
        if(NULL == peers[proc])
            continue;
        mca_btl_sm_component.sm_peers[peers[proc]->peer_smp_rank] = peers[proc];
        peers[proc]->my_smp_rank = my_smp_rank;
    }

    bases = mca_btl_sm_component.shm_bases;
    sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool;

    /* initialize own FIFOs */
    /*
     * The receiver initializes all its FIFOs.  All components will
     * be allocated near the receiver.  Nothing will be local to
     * "the sender" since there will be many senders.
     */
    for(j = mca_btl_sm_component.num_smp_procs;
        j < mca_btl_sm_component.num_smp_procs + FIFO_MAP_NUM(n_local_procs); j++) {

        return_code = sm_fifo_init( mca_btl_sm_component.fifo_size,
                                    mca_btl_sm_component.sm_mpool,
                                   &mca_btl_sm_component.fifo[my_smp_rank][j],
                                    mca_btl_sm_component.fifo_lazy_free);
        if(return_code != OMPI_SUCCESS)
            goto CLEANUP;
    }

    /* write barrier between the FIFO initialization above and the
     * increment of the shared seg_inited counter below */
    opal_atomic_wmb();

    /* Sync with other local procs. Force the FIFO initialization to always
     * happen before the readers access it.  Each proc bumps seg_inited once
     * and then spins (while driving progress) until the counter has reached
     * n_local_procs.
     */
    opal_atomic_add_32(&mca_btl_sm_component.sm_seg->module_seg->seg_inited, 1);
    while( n_local_procs >
           mca_btl_sm_component.sm_seg->module_seg->seg_inited) {
        opal_progress();
        opal_atomic_rmb();
    }

    /* it is now safe to unlink the shared memory segment. only one process
     * needs to do this, so just let smp rank zero take care of it. */
    if (0 == my_smp_rank) {
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(mca_btl_sm_component.sm_seg)) {
            /* it is "okay" if this fails at this point. we have gone this far,
             * so just warn about the failure and continue. this is probably
             * only triggered by a programming error. */
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        /* SKG - another abstraction violation here, but I don't want to add
         * extra code in the sm mpool for further synchronization. */

        /* at this point, all processes have attached to the mpool segment. so
         * it is safe to unlink it here. */
        if (OMPI_SUCCESS !=
            mca_common_sm_module_unlink(sm_mpool_modp->sm_common_module)) {
            opal_output(0, "WARNING: common_sm_module_unlink failed.\n");
        }
        /* the two rendezvous files are removed as well; failures only warn */
        if (-1 == unlink(mca_btl_sm_component.sm_mpool_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_mpool_rndv_file_name);
        }
        if (-1 == unlink(mca_btl_sm_component.sm_rndv_file_name)) {
            opal_output(0, "WARNING: %s unlink failed.\n",
                        mca_btl_sm_component.sm_rndv_file_name);
        }
    }

    /* free up some space used by the name buffers.
     * NOTE(review): the pointers are not reset to NULL after being freed and
     * this code is not guarded by btl_inited - confirm that a later call to
     * this function cannot reach the unlink()/free() calls again with the
     * stale pointers. */
    free(mca_btl_sm_component.sm_mpool_ctl_file_name);
    free(mca_btl_sm_component.sm_mpool_rndv_file_name);
    free(mca_btl_sm_component.sm_ctl_file_name);
    free(mca_btl_sm_component.sm_rndv_file_name);

    /* coordinate with other processes */
    for(j = mca_btl_sm_component.num_smp_procs;
        j < mca_btl_sm_component.num_smp_procs + n_local_procs; j++) {
        ptrdiff_t diff;

        /* spin until this element is allocated */
        /* doesn't really wait for that process... FIFO might be allocated, but not initialized */
        opal_atomic_rmb();
        while(NULL == mca_btl_sm_component.shm_fifo[j]) {
            opal_progress();
            opal_atomic_rmb();
        }

        /* Calculate the difference as (my_base - their_base) */
        diff = ADDR2OFFSET(bases[my_smp_rank], bases[j]);

        /* store local address of remote fifos */
        mca_btl_sm_component.fifo[j] =
            (sm_fifo_t*)OFFSET2ADDR(diff, mca_btl_sm_component.shm_fifo[j]);

        /* cache local copy of peer memory node number */
        mca_btl_sm_component.mem_nodes[j] = mca_btl_sm_component.shm_mem_nodes[j];
    }

    /* update the local smp process count */
    mca_btl_sm_component.num_smp_procs += n_local_procs;

    /* make sure we have enough eager fragments for each process */
    return_code = ompi_free_list_resize_mt(&mca_btl_sm_component.sm_frags_eager,
                                           mca_btl_sm_component.num_smp_procs * 2);
    if (OMPI_SUCCESS != return_code)
        goto CLEANUP;

    /* common exit: nothing is released here, the label only returns the code */
CLEANUP:
    return return_code;
}
Ejemplo n.º 28
0
/**
 * Add a set of procs to the DR PML.
 *
 * Verifies that the peers are usable (same architecture when heterogeneous
 * support is compiled in, same selected PML), lets the BML set up its
 * endpoints, checks that every initialized BTL has an eager limit large
 * enough for a DR header, registers the receive and error callbacks,
 * initializes the buffer free list and finally creates one PML endpoint per
 * proc.
 *
 * @param procs   procs to add; procs[i]->proc_pml is set to the new endpoint.
 * @param nprocs  number of entries in procs.
 *
 * @return OMPI_SUCCESS (also for nprocs == 0), OMPI_ERR_NOT_SUPPORTED for a
 *         peer with a different architecture, OMPI_ERR_BAD_PARAM if a BTL's
 *         eager limit is too small, OMPI_ERR_OUT_OF_RESOURCE if an endpoint
 *         cannot be allocated, or the error code of the failing helper.
 */
int mca_pml_dr_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    opal_bitmap_t reachable;
    int rc;
    size_t i;
    opal_list_item_t *item;

    if(nprocs == 0)
        return OMPI_SUCCESS;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    for (i = 0 ; i < nprocs ; ++i) {
        if (procs[i]->proc_arch != ompi_proc_local()->proc_arch) {
            return OMPI_ERR_NOT_SUPPORTED;
        }
    }
#endif

    /* make sure remote procs are using the same PML as us */
    if (OMPI_SUCCESS != (rc = mca_pml_base_pml_check_selected("dr",
                                                              procs,
                                                              nprocs))) {
        return rc;
    }

    /* From here on every exit goes through cleanup_and_return so that the
     * bitmap constructed on the stack is always destructed again. */
    OBJ_CONSTRUCT(&reachable, opal_bitmap_t);
    rc = opal_bitmap_init(&reachable, (int)nprocs);
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* initialize bml endpoint data */
    rc = mca_bml.bml_add_procs(
                               nprocs,
                               procs,
                               &reachable
                               );
    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* Check that values supplied by all initialized btls will work
       for us.  Note that this is the list of all initialized BTLs,
       not the ones used for the just added procs.  This is a little
       overkill and inaccurate, as we may end up not using the BTL in
       question and all add_procs calls after the first one are
       duplicating an already completed check.  But the final
       initialization of the PML occurs before the final
       initialization of the BTLs, and iterating through the in-use
       BTLs requires iterating over the procs, as the BML does not
       expose all currently in use btls. */

    for (item = opal_list_get_first(&mca_btl_base_modules_initialized) ;
         item != opal_list_get_end(&mca_btl_base_modules_initialized) ;
         item = opal_list_get_next(item)) {
        mca_btl_base_selected_module_t *sm =
            (mca_btl_base_selected_module_t*) item;
        if (sm->btl_module->btl_eager_limit < sizeof(mca_pml_dr_hdr_t)) {
            orte_show_help("help-mpi-pml-dr.txt", "eager_limit_too_small",
                           true,
                           sm->btl_component->btl_version.mca_component_name,
                           orte_process_info.nodename,
                           sm->btl_component->btl_version.mca_component_name,
                           sm->btl_module->btl_eager_limit,
                           sm->btl_component->btl_version.mca_component_name,
                           sizeof(mca_pml_dr_hdr_t),
                           sm->btl_component->btl_version.mca_component_name);
            rc = OMPI_ERR_BAD_PARAM;
            goto cleanup_and_return;
        }
    }

    /* register recv handler */
    rc = mca_bml.bml_register(
                              MCA_BTL_TAG_PML,
                              mca_pml_dr_recv_frag_callback,
                              NULL);

    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* register error handlers */
    rc = mca_bml.bml_register_error(mca_pml_dr_error_handler);

    if(OMPI_SUCCESS != rc)
        goto cleanup_and_return;

    /* each buffer carries room for one eager-sized payload */
    ompi_free_list_init_new(
                        &mca_pml_dr.buffers,
                        sizeof(mca_pml_dr_buffer_t) + mca_pml_dr.eager_limit,
                        opal_cache_line_size,
                        OBJ_CLASS(mca_pml_dr_buffer_t),
                        0,opal_cache_line_size,
                        0,
                        mca_pml_dr.free_list_max,
                        mca_pml_dr.free_list_inc,
                        NULL);

    /* initialize pml endpoint data */
    for (i = 0 ; i < nprocs ; ++i) {
        int idx;
        mca_pml_dr_endpoint_t *endpoint;

        endpoint = OBJ_NEW(mca_pml_dr_endpoint_t);
        if (NULL == endpoint) {
            /* do not dereference a failed allocation */
            rc = OMPI_ERR_OUT_OF_RESOURCE;
            goto cleanup_and_return;
        }
        endpoint->proc_ompi = procs[i];
        procs[i]->proc_pml = (struct mca_pml_endpoint_t*) endpoint;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: adding endpoint %p to proc_pml %p\n",
                              __FILE__, __LINE__, (void*)endpoint, (void*)procs[i]));

        /* this won't work for comm spawn and other dynamic
           processes, but will work for initial job start */
        idx = opal_pointer_array_add(&mca_pml_dr.endpoints, (void*) endpoint);
        /* remember the index of the endpoint that describes ourselves */
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                           ORTE_PROC_MY_NAME,
                           &(endpoint->proc_ompi->proc_name))) {
            mca_pml_dr.my_rank = idx;
        }
        endpoint->local = endpoint->dst = idx;
        MCA_PML_DR_DEBUG(10, (0, "%s:%d: setting endpoint->dst to %d\n",
                              __FILE__, __LINE__, idx));

        endpoint->bml_endpoint = procs[i]->proc_bml;
    }

    /* second pass: my_rank is only known once all endpoints were added */
    for(i = 0; i < nprocs; i++) {
        mca_pml_dr_endpoint_t* ep =  (mca_pml_dr_endpoint_t*)
            opal_pointer_array_get_item(&mca_pml_dr.endpoints, i);
        ep->src = mca_pml_dr.my_rank;
    }

cleanup_and_return:
    /* release the storage owned by the stack-constructed bitmap */
    OBJ_DESTRUCT(&reachable);
    return rc;
}