/* Example #1 */
/*
 * Called when the CPC (connection pseudo-component) has established a
 * connection on an endpoint.
 *
 * If the CPC uses the CTS (clear-to-send) handshake: post receive
 * buffers (so credit management starts from a sane state), then send
 * our CTS when appropriate; the endpoint is marked connected only once
 * both sides' CTS have been seen.  If the CPC does not use CTS, the
 * endpoint is marked connected immediately.  On error,
 * mca_btl_openib_endpoint_invoke_error() is called and we return.
 */
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
{
    /* If the CPC uses the CTS protocol, then start it up */
    if (endpoint->endpoint_local_cpc->cbm_uses_cts) {
        int transport_type_ib_p = 0;
        /* Post our receives, which will make credit management happy
           (i.e., rd_credits will be 0) */
        if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_recvs(endpoint)) {
            BTL_ERROR(("Failed to post receive buffers"));
            mca_btl_openib_endpoint_invoke_error(endpoint);
            return;
        }
        endpoint->endpoint_posted_recvs = true;

        /* If this is IB, send the CTS immediately.  If this is iWARP,
           then only send the CTS if this endpoint was the initiator
           of the connection (the receiver will send its CTS when it
           receives this side's CTS).  Also send the CTS if we already
           received the peer's CTS (e.g., if this process was slow to
           call cpc_complete()). */
#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE)
        transport_type_ib_p = (IBV_TRANSPORT_IB == endpoint->endpoint_btl->device->ib_dev->transport_type);
#endif
        /* Fixed typo in debug message: "initiatior" -> "initiator" */
        OPAL_OUTPUT((-1, "cpc_complete to peer %s: is IB %d, initiator %d, cts received: %d",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                     transport_type_ib_p,
                     endpoint->endpoint_initiator,
                     endpoint->endpoint_cts_received));
        if (transport_type_ib_p ||
            endpoint->endpoint_initiator ||
            endpoint->endpoint_cts_received) {
            mca_btl_openib_endpoint_send_cts(endpoint);

            /* If we've already got the CTS from the other side, then
               mark us as connected */
            if (endpoint->endpoint_cts_received) {
                OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
                             opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
                mca_btl_openib_endpoint_connected(endpoint);
            }
        }

        OPAL_OUTPUT((-1, "cpc_complete to %s -- done",
                     opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
        return;
    }

    /* Otherwise, just set the endpoint to "connected" */
    mca_btl_openib_endpoint_connected(endpoint);
}
/* Example #2 */
/*
 * Send-completion callback for the CTS control fragment.
 *
 * Intentionally does nothing beyond (disabled-by-default) debug
 * output; it exists only because the descriptor's des_cbfunc may not
 * be NULL.  btl, des, and status are unused.
 */
static void cts_sent(mca_btl_base_module_t* btl,
                     struct mca_btl_base_endpoint_t* ep,
                     struct mca_btl_base_descriptor_t* des,
                     int status)
{
    /* Nothing to do/empty function (we can't pass in a NULL pointer
       for the des_cbfunc) */
    OPAL_OUTPUT((-1, "CTS send to %s completed",
                 opal_get_proc_hostname(ep->endpoint_proc->proc_opal)));
}
/*
 * Allocate and register the buffer used for the CTS (clear-to-send)
 * fragment on this endpoint.
 *
 * The buffer is sized to hold all openib headers/footers plus the
 * payload size of the credits QP, malloc'ed outside the mpool, and
 * registered with ibv_reg_mr().  Returns OPAL_SUCCESS, or
 * OPAL_ERR_OUT_OF_RESOURCE if the allocation or registration fails
 * (the malloc'ed buffer is freed on registration failure).
 */
int opal_btl_openib_connect_base_alloc_cts(mca_btl_base_endpoint_t *endpoint)
{
    opal_free_list_item_t *fli;
    /* Total space needed: every header/footer layer plus the credits
       QP's receive-buffer payload size */
    int length = sizeof(mca_btl_openib_header_t) +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t) +
        sizeof(mca_btl_openib_footer_t) +
        mca_btl_openib_component.qp_infos[mca_btl_openib_component.credits_qp].size;

    /* Explicitly don't use the mpool registration */
    fli = &(endpoint->endpoint_cts_frag.super.super.base.super);
    fli->registration = NULL;
    fli->ptr = malloc(length);
    if (NULL == fli->ptr) {
        BTL_ERROR(("malloc failed"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Register for local write plus remote read/write so the peer's
       CTS exchange can touch this buffer */
    endpoint->endpoint_cts_mr =
        ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd,
                   fli->ptr, length,
                   IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
                   IBV_ACCESS_REMOTE_READ);
    OPAL_OUTPUT((-1, "registered memory %p, length %d", fli->ptr, length));
    if (NULL == endpoint->endpoint_cts_mr) {
        free(fli->ptr);
        BTL_ERROR(("Failed to reg mr!"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    /* NOTE: We do not need to register this memory with the
       opal_memory subsystem, because this is OMPI-controlled memory
       -- we do not need to worry about this memory being freed out
       from underneath us. */

    /* Copy the lkey where it needs to go */
    endpoint->endpoint_cts_frag.super.sg_entry.lkey =
        endpoint->endpoint_cts_mr->lkey;
    endpoint->endpoint_cts_frag.super.sg_entry.length = length;

    /* Construct the rest of the recv_frag_t */
    /* NOTE(review): OBJ_CONSTRUCT runs AFTER the sg_entry lkey/length
       assignments above; this is only correct if the
       mca_btl_openib_recv_frag_t constructor does not reset those
       fields -- confirm against the class constructor. */
    OBJ_CONSTRUCT(&(endpoint->endpoint_cts_frag), mca_btl_openib_recv_frag_t);
    endpoint->endpoint_cts_frag.super.super.base.order =
        mca_btl_openib_component.credits_qp;
    endpoint->endpoint_cts_frag.super.endpoint = endpoint;
    /* NOTE(review): sg_entry.addr is printed below but never assigned
       in this function; presumably the OBJ_CONSTRUCT constructor sets
       it from fli->ptr -- confirm. */
    OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                 (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr,
                 endpoint->endpoint_cts_frag.super.sg_entry.length,
                 endpoint->endpoint_cts_frag.super.sg_entry.lkey));

    return OPAL_SUCCESS;
}
/* Example #4 */
/*
 * Send the CTS (clear-to-send) control fragment to the peer.
 *
 * Allocates a control fragment, fills in its descriptor/headers with
 * the CTS control type, and posts it on the credits QP.  On allocation
 * or post failure, mca_btl_openib_endpoint_invoke_error() is called.
 */
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
{
    mca_btl_openib_send_control_frag_t *sc_frag;
    mca_btl_base_descriptor_t *base_des;
    mca_btl_openib_frag_t *openib_frag;
    mca_btl_openib_com_frag_t *com_frag;
    mca_btl_openib_control_header_t *ctl_hdr;

    OPAL_OUTPUT((-1, "SENDING CTS to %s on qp index %d (QP num %d)",
                 opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal),
                 mca_btl_openib_component.credits_qp,
                 endpoint->qps[mca_btl_openib_component.credits_qp].qp->lcl_qp->qp_num));
    sc_frag = alloc_control_frag(endpoint->endpoint_btl);
    if (OPAL_UNLIKELY(NULL == sc_frag)) {
        BTL_ERROR(("Failed to allocate control buffer"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
        return;
    }

    /* I dislike using the "to_<foo>()" macros; I prefer using the
       explicit member fields to ensure I get the types right.  Since
       this is not a performance-critical part of the code, it's
       ok. */
    com_frag = &(sc_frag->super.super);
    openib_frag = &(com_frag->super);
    base_des = &(openib_frag->base);

    /* Request a completion callback (cts_sent is a no-op, but
       des_cbfunc may not be NULL) and send on the credits QP */
    base_des->des_cbfunc = cts_sent;
    base_des->des_cbdata = NULL;
    base_des->des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    base_des->order = mca_btl_openib_component.credits_qp;
    openib_frag->segment.seg_len = sizeof(mca_btl_openib_control_header_t);
    com_frag->endpoint = endpoint;

    sc_frag->hdr->tag = MCA_BTL_TAG_IB;
    sc_frag->hdr->cm_seen = 0;
    sc_frag->hdr->credits = 0;

    /* The payload is just the control header marking this as a CTS */
    ctl_hdr = (mca_btl_openib_control_header_t*)
        openib_frag->segment.seg_addr.pval;
    ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;

    /* Send the fragment */
    OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
    if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
        BTL_ERROR(("Failed to post CTS send"));
        mca_btl_openib_endpoint_invoke_error(endpoint);
    }
    /* NOTE(review): endpoint_cts_sent is set to true even when the
       post above failed; presumably invoke_error() tears the endpoint
       down so this doesn't matter -- confirm. */
    endpoint->endpoint_cts_sent = true;
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
/*
 * Note that this routine must be called with the lock on the process
 * already held.  Insert a btl instance into the proc array and assign
 * it an address.
 *
 * Builds the set of local and peer kernel interfaces, scores every
 * (local, peer) interface pair by connection quality, then assigns the
 * best unused peer address to this endpoint: an exhaustive permutation
 * search (visit()) when the interface count is small, a greedy pick of
 * the single best pair otherwise.
 *
 * Returns OPAL_SUCCESS when an address was assigned, OPAL_ERR_UNREACH
 * when no usable pair exists, or OPAL_ERR_OUT_OF_RESOURCE on
 * allocation failure.
 */
int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
                             mca_btl_base_endpoint_t* btl_endpoint )
{
    struct sockaddr_storage endpoint_addr_ss;
    const char *proc_hostname;
    unsigned int perm_size;
    int rc, *a = NULL;
    size_t i, j;
    mca_btl_tcp_interface_t** peer_interfaces;
    mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data;
    size_t max_peer_interfaces;
    memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t));

    if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) {
        return OPAL_ERR_UNREACH;
    }

#ifndef WORDS_BIGENDIAN
    /* if we are little endian and our peer is not so lucky, then we
       need to put all information sent to him in big endian (aka
       Network Byte Order) and expect all information received to
       be in NBO.  Since big endian machines always send and receive
       in NBO, we don't care so much about that case. */
    if (btl_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) {
        btl_endpoint->endpoint_nbo = true;
    }
#endif

    /* insert into endpoint array */
    btl_endpoint->endpoint_proc = btl_proc;
    btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint;

    /* sanity checks */
    if( NULL == mca_btl_tcp_retrieve_local_interfaces(proc_data) )
        return OPAL_ERR_OUT_OF_RESOURCE;
    if( 0 == proc_data->num_local_interfaces ) {
        return OPAL_ERR_UNREACH;
    }

    max_peer_interfaces = proc_data->max_local_interfaces;
    peer_interfaces = (mca_btl_tcp_interface_t**)calloc( max_peer_interfaces, sizeof(mca_btl_tcp_interface_t*) );
    assert(NULL != peer_interfaces);
    proc_data->num_peer_interfaces = 0;
    memset(proc_data->peer_kindex_to_index, -1, sizeof(int)*MAX_KERNEL_INTERFACE_INDEX);

    /*
     * identify all kernel interfaces and the associated addresses of
     * the peer
     */

    for( i = 0; i < btl_proc->proc_addr_count; i++ ) {

        int index;

        mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i;

        mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss);

        index = proc_data->peer_kindex_to_index[endpoint_addr->addr_ifkindex];

        if(-1 == index) {
            index = proc_data->num_peer_interfaces++;
            proc_data->peer_kindex_to_index[endpoint_addr->addr_ifkindex] = index;
            if( proc_data->num_peer_interfaces == max_peer_interfaces ) {
                /* Grow via a temporary so a failed realloc() does not
                   leak the original array (the old code overwrote
                   peer_interfaces directly) */
                mca_btl_tcp_interface_t** tmp;
                max_peer_interfaces <<= 1;
                tmp = (mca_btl_tcp_interface_t**)realloc( peer_interfaces,
                                                          max_peer_interfaces * sizeof(mca_btl_tcp_interface_t*) );
                if( NULL == tmp ) {
                    /* NOTE(review): the interface structs already in
                       the array and the local-interface data still
                       leak on this error path, as they did before. */
                    free(peer_interfaces);
                    return OPAL_ERR_OUT_OF_RESOURCE;
                }
                peer_interfaces = tmp;
            }
            peer_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t));
            mca_btl_tcp_initialise_interface(peer_interfaces[index],
                                             endpoint_addr->addr_ifkindex, index);
        }

        /*
         * in case one of the peer addresses is already in use,
         * mark the complete peer interface as 'not available'
         */
        if(endpoint_addr->addr_inuse) {
            peer_interfaces[index]->inuse = 1;
        }

        switch(endpoint_addr_ss.ss_family) {
        case AF_INET:
            peer_interfaces[index]->ipv4_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
            peer_interfaces[index]->ipv4_endpoint_addr = endpoint_addr;
            memcpy(peer_interfaces[index]->ipv4_address,
                   &endpoint_addr_ss, sizeof(endpoint_addr_ss));
            break;
        case AF_INET6:
            peer_interfaces[index]->ipv6_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss));
            peer_interfaces[index]->ipv6_endpoint_addr = endpoint_addr;
            memcpy(peer_interfaces[index]->ipv6_address,
                   &endpoint_addr_ss, sizeof(endpoint_addr_ss));
            break;
        default:
            opal_output(0, "unknown address family for tcp: %d\n",
                        endpoint_addr_ss.ss_family);
            /*
             * return OPAL_UNREACH or some error, as this is not
             * good
             */
        }
    }

    /*
     * assign weights to each possible pair of interfaces
     */

    perm_size = proc_data->num_local_interfaces;
    if(proc_data->num_peer_interfaces > perm_size) {
        perm_size = proc_data->num_peer_interfaces;
    }

    proc_data->weights = (enum mca_btl_tcp_connection_quality**) malloc(perm_size
                                                             * sizeof(enum mca_btl_tcp_connection_quality*));
    assert(NULL != proc_data->weights);

    proc_data->best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size
                                                * sizeof(mca_btl_tcp_addr_t **));
    assert(NULL != proc_data->best_addr);
    for(i = 0; i < perm_size; ++i) {
        proc_data->weights[i] = (enum mca_btl_tcp_connection_quality*) calloc(perm_size,
                                                                   sizeof(enum mca_btl_tcp_connection_quality));
        assert(NULL != proc_data->weights[i]);

        proc_data->best_addr[i] = (mca_btl_tcp_addr_t **) calloc(perm_size,
                                                      sizeof(mca_btl_tcp_addr_t *));
        assert(NULL != proc_data->best_addr[i]);
    }


    for( i = 0; i < proc_data->num_local_interfaces; ++i ) {
        mca_btl_tcp_interface_t* local_interface = proc_data->local_interfaces[i];
        for( j = 0; j < proc_data->num_peer_interfaces; ++j ) {

            /*  initially, assume no connection is possible */
            proc_data->weights[i][j] = CQ_NO_CONNECTION;

            /* check state of ipv4 address pair */
            if(NULL != proc_data->local_interfaces[i]->ipv4_address &&
               NULL != peer_interfaces[j]->ipv4_address) {

                if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) &&
                   opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) {
                    if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
                                            (struct sockaddr*) peer_interfaces[j]->ipv4_address,
                                            local_interface->ipv4_netmask)) {
                        proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
                    } else {
                        proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
                    }
                    proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
                    continue;
                }
                if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address,
                                        (struct sockaddr*) peer_interfaces[j]->ipv4_address,
                                        local_interface->ipv4_netmask)) {
                    proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK;
                } else {
                    proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK;
                }
                proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr;
                continue;
            }

            /* check state of ipv6 address pair - ipv6 is always public,
             * since link-local addresses are skipped in opal_ifinit()
             */
            if(NULL != local_interface->ipv6_address &&
               NULL != peer_interfaces[j]->ipv6_address) {

                if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address,
                                         (struct sockaddr*) peer_interfaces[j]->ipv6_address,
                                         local_interface->ipv6_netmask)) {
                    proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK;
                } else {
                    proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK;
                }
                proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr;
                continue;
            }

        } /* for each peer interface */
    } /* for each local interface */

    /*
     * determine the size of the set to permute (max number of
     * interfaces
     */

    /* Check both allocations before use (best_assignment was
       previously unchecked and is dereferenced by visit() below) */
    proc_data->best_assignment = (unsigned int *) malloc (perm_size * sizeof(unsigned int));
    a = (int *) malloc(perm_size * sizeof(int));
    if (NULL == proc_data->best_assignment || NULL == a) {
        /* NOTE(review): weights/best_addr/peer_interfaces still leak
           on this error path, matching the other early returns. */
        free(proc_data->best_assignment);
        free(a);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Can only find the best set of connections when the number of
     * interfaces is not too big.  When it gets larger, we fall back
     * to a simpler and faster (and not as optimal) algorithm.
     * See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031
     * for more details about this issue.  */
    if (perm_size <= MAX_PERMUTATION_INTERFACES) {
        memset(a, 0, perm_size * sizeof(int));
        proc_data->max_assignment_cardinality = -1;
        proc_data->max_assignment_weight = -1;
        visit(proc_data, 0, -1, perm_size, a);

        rc = OPAL_ERR_UNREACH;
        for(i = 0; i < perm_size; ++i) {
            unsigned int best = proc_data->best_assignment[i];
            /* Bounds check must reject best == num_peer_interfaces
               (one past the end), and the NULL check must come before
               any dereference of peer_interfaces[best]; the original
               code did both the other way around. */
            if(best >= proc_data->num_peer_interfaces
               || NULL == peer_interfaces[best]
               || proc_data->weights[i][best] == CQ_NO_CONNECTION
               || peer_interfaces[best]->inuse) {
                continue;
            }
            peer_interfaces[best]->inuse++;
            btl_endpoint->endpoint_addr = proc_data->best_addr[i][best];
            btl_endpoint->endpoint_addr->addr_inuse++;
            rc = OPAL_SUCCESS;
            break;
        }
    } else {
        enum mca_btl_tcp_connection_quality max;
        int i_max = 0, j_max = 0;
        /* Find the best connection that is not in use.  Save away
         * the indices of the best location. */
        max = CQ_NO_CONNECTION;
        for(i=0; i<proc_data->num_local_interfaces; ++i) {
            for(j=0; j<proc_data->num_peer_interfaces; ++j) {
                if (!peer_interfaces[j]->inuse) {
                    if (proc_data->weights[i][j] > max) {
                        max = proc_data->weights[i][j];
                        i_max = i;
                        j_max = j;
                    }
                }
            }
        }
        /* Now see if there is a some type of connection available. */
        rc = OPAL_ERR_UNREACH;
        if (CQ_NO_CONNECTION != max) {
            peer_interfaces[j_max]->inuse++;
            btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max];
            btl_endpoint->endpoint_addr->addr_inuse++;
            rc = OPAL_SUCCESS;
        }
    }

    /* Release all scratch data; the chosen endpoint_addr points into
       btl_proc->proc_addrs, so it survives this cleanup. */
    for(i = 0; i < perm_size; ++i) {
        free(proc_data->weights[i]);
        free(proc_data->best_addr[i]);
    }

    for(i = 0; i < proc_data->num_peer_interfaces; ++i) {
        if(NULL != peer_interfaces[i]->ipv4_address) {
            free(peer_interfaces[i]->ipv4_address);
        }
        if(NULL != peer_interfaces[i]->ipv6_address) {
            free(peer_interfaces[i]->ipv6_address);
        }
        free(peer_interfaces[i]);
    }
    free(peer_interfaces);

    for(i = 0; i < proc_data->num_local_interfaces; ++i) {
        if(NULL != proc_data->local_interfaces[i]->ipv4_address) {
            free(proc_data->local_interfaces[i]->ipv4_address);
        }
        if(NULL != proc_data->local_interfaces[i]->ipv6_address) {
            free(proc_data->local_interfaces[i]->ipv6_address);
        }
        free(proc_data->local_interfaces[i]);
    }
    free(proc_data->local_interfaces);
    proc_data->max_local_interfaces = 0;

    free(proc_data->weights);
    free(proc_data->best_addr);
    free(proc_data->best_assignment);
    free(a);

    return rc;
}