Ejemplo n.º 1
0
/*
 * A blocking recv on a non-blocking socket. Used to receive the small amount of connection
 * information that identifies the endpoints endpoint.
 */
static int mca_btl_tcp_endpoint_recv_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while(cnt < size) {
        int retval = recv(btl_endpoint->endpoint_sd, (char *)ptr+cnt, size-cnt, 0);

        /* remote closed connection */
        if(retval == 0) {
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return -1;
        }

        /* socket is non-blocking so handle errors */
        if(retval < 0) {
            if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
                BTL_ERROR(("recv(%d) failed: %s (%d)",
                           btl_endpoint->endpoint_sd, strerror(opal_socket_errno), opal_socket_errno));
                mca_btl_tcp_endpoint_close(btl_endpoint);
                return -1;
            }
            continue;
        }
        cnt += retval;
    }
    return cnt;
}
Ejemplo n.º 2
0
bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
{
    int cnt=-1;
    size_t i, num_vecs;

    /* non-blocking write, but continue if interrupted */
    while(cnt < 0) {
        cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
        if(cnt < 0) {
            switch(opal_socket_errno) {
            case EINTR:
                continue;
            case EWOULDBLOCK:
                return false;
            case EFAULT:
                BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
                    frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
                    strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            default:
                BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
                           strerror(opal_socket_errno),
                           opal_socket_errno));
                frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            }
        }
    }

    /* if the write didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for(i=0; i<num_vecs; i++) {
        if(cnt >= (int)frag->iov_ptr->iov_len) {
            cnt -= frag->iov_ptr->iov_len;
            frag->iov_ptr++;
            frag->iov_idx++;
            frag->iov_cnt--;
        } else {
            frag->iov_ptr->iov_base = (opal_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            OPAL_OUTPUT_VERBOSE((100, opal_btl_base_framework.framework_output,
                                 "%s:%d write %d bytes on socket %d\n",
                                 __FILE__, __LINE__, cnt, sd));
            break;
        }
    }
    return (frag->iov_cnt == 0);
}
Ejemplo n.º 3
0
bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
{
    int cnt=-1;
    size_t i, num_vecs;

    /* non-blocking write, but continue if interrupted */
    while(cnt < 0) {
        cnt = writev(sd, frag->iov_ptr, frag->iov_cnt);
        if(cnt < 0) {
            switch(opal_socket_errno) {
            case EINTR:
                continue;
            case EWOULDBLOCK:
                return false;
            case EFAULT:
                BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %d)\n\t%s(%d)\n",
                    frag->iov_ptr[0].iov_base, frag->iov_ptr[0].iov_len,
                    strerror(opal_socket_errno), frag->iov_cnt));
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            default:
                BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)", 
                           strerror(opal_socket_errno),
                           opal_socket_errno));
                mca_btl_tcp_endpoint_close(frag->endpoint);
                return false;
            }
        }
    }

    /* if the write didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for(i=0; i<num_vecs; i++) {
        if(cnt >= (int)frag->iov_ptr->iov_len) {
            cnt -= frag->iov_ptr->iov_len;
            frag->iov_ptr++;
            frag->iov_idx++;
            frag->iov_cnt--;
        } else {
            frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            break;
        }
    }
    return (frag->iov_cnt == 0);
}
Ejemplo n.º 4
0
/*
 * Destroy a endpoint
 *
 */
static void mca_btl_tcp_endpoint_destruct(mca_btl_tcp_endpoint_t* endpoint)
{
    mca_btl_tcp_proc_remove(endpoint->endpoint_proc, endpoint);
    mca_btl_tcp_endpoint_close(endpoint);
    OBJ_DESTRUCT(&endpoint->endpoint_frags);
    OBJ_DESTRUCT(&endpoint->endpoint_send_lock);
    OBJ_DESTRUCT(&endpoint->endpoint_recv_lock);
}
Ejemplo n.º 5
0
bool mca_btl_tcp_endpoint_accept(mca_btl_base_endpoint_t* btl_endpoint,
                                 struct sockaddr* addr, int sd)
{
    mca_btl_tcp_proc_t* this_proc = mca_btl_tcp_proc_local();
    mca_btl_tcp_proc_t *endpoint_proc = btl_endpoint->endpoint_proc;
    int cmpval;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);

    if(NULL == btl_endpoint->endpoint_addr) {
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return false;
    }

    cmpval = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, 
                                    &endpoint_proc->proc_ompi->proc_name,
                                    &this_proc->proc_ompi->proc_name);
    if((btl_endpoint->endpoint_sd < 0) ||
       (btl_endpoint->endpoint_state != MCA_BTL_TCP_CONNECTED &&
        cmpval < 0)) {
        mca_btl_tcp_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_sd = sd;
        if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) != OMPI_SUCCESS) {
            mca_btl_tcp_endpoint_close(btl_endpoint);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return false;
        }
        mca_btl_tcp_endpoint_event_init(btl_endpoint);
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
        mca_btl_tcp_endpoint_connected(btl_endpoint);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
        mca_btl_tcp_endpoint_dump(btl_endpoint, "accepted");
#endif
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        return true;
    }
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
    return false;
}
Ejemplo n.º 6
0
/*
 * Check the status of the connection. If the connection failed, will retry
 * later. Otherwise, send this processes identifier to the endpoint on the 
 * newly connected socket.
 */
static void mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int so_error = 0;
    opal_socklen_t so_length = sizeof(so_error);
    struct sockaddr_storage endpoint_addr;

    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    /* unregister from receiving event notifications */
    opal_event_del(&btl_endpoint->endpoint_send_event);

    /* check connect completion status */
    if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_ERROR, (char *)&so_error, &so_length) < 0) {
        BTL_ERROR(("getsockopt() to %s failed: %s (%d)", 
                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                   strerror(opal_socket_errno), opal_socket_errno));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
    }
    if(so_error == EINPROGRESS || so_error == EWOULDBLOCK) {
        opal_event_add(&btl_endpoint->endpoint_send_event, 0);
        return;
    }
    if(so_error != 0) {
        BTL_ERROR(("connect() to %s failed: %s (%d)", 
                   opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                   strerror(so_error), so_error));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return;
    }

    if(mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint) == OMPI_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    } else {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
}
Ejemplo n.º 7
0
/*
 * A blocking send on a non-blocking socket. Used to send the small amount of connection
 * information that identifies the endpoints endpoint.
 */
static int mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint, void* data, size_t size)
{
    unsigned char* ptr = (unsigned char*)data;
    size_t cnt = 0;
    while(cnt < size) {
        int retval = send(btl_endpoint->endpoint_sd, (const char *)ptr+cnt, size-cnt, 0);
        if(retval < 0) {
            if(opal_socket_errno != EINTR && opal_socket_errno != EAGAIN && opal_socket_errno != EWOULDBLOCK) {
                BTL_ERROR(("send() failed: %s (%d)",
                           strerror(opal_socket_errno), opal_socket_errno));
                mca_btl_tcp_endpoint_close(btl_endpoint);
                return -1;
            }
            continue;
        }
        cnt += retval;
    }
    return cnt;
}
Ejemplo n.º 8
0
/*
 *  Receive the endpoints globally unique process identification from a newly
 *  connected socket and verify the expected response. If so, move the
 *  socket to a connected state.
 */
static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_endpoint)
{
    orte_process_name_t guid;
    mca_btl_tcp_proc_t* btl_proc = btl_endpoint->endpoint_proc;

    if((mca_btl_tcp_endpoint_recv_blocking(btl_endpoint, &guid, sizeof(orte_process_name_t))) != sizeof(orte_process_name_t)) {
        return OMPI_ERR_UNREACH;
    }
    ORTE_PROCESS_NAME_NTOH(guid);
    /* compare this to the expected values */
    if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                    &btl_proc->proc_ompi->proc_name,
                                                    &guid)) {
        BTL_ERROR(("received unexpected process identifier %s", 
                   ORTE_NAME_PRINT(&guid)));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return OMPI_ERR_UNREACH;
    }

    return OMPI_SUCCESS;
}
Ejemplo n.º 9
0
bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
{
    mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;
    int i, num_vecs, dont_copy_data = 0;
    ssize_t cnt;

 repeat:
    num_vecs = frag->iov_cnt;
#if MCA_BTL_TCP_ENDPOINT_CACHE
    if( 0 != btl_endpoint->endpoint_cache_length ) {
        size_t length;
        /* It's strange at the first look but cnt have to be set to the full amount of data
         * available. After going to advance_iov_position we will use cnt to detect if there
         * is still some data pending.
         */
        cnt = length = btl_endpoint->endpoint_cache_length;
        for( i = 0; i < (int)frag->iov_cnt; i++ ) {
            if( length > frag->iov_ptr[i].iov_len )
                length = frag->iov_ptr[i].iov_len;
            if( (0 == dont_copy_data) || (length < frag->iov_ptr[i].iov_len) ) {
                memcpy( frag->iov_ptr[i].iov_base, btl_endpoint->endpoint_cache_pos, length );
            } else {
                frag->segments[0].seg_addr.pval = btl_endpoint->endpoint_cache_pos;
                frag->iov_ptr[i].iov_base = btl_endpoint->endpoint_cache_pos;
            }
            btl_endpoint->endpoint_cache_pos += length;
            btl_endpoint->endpoint_cache_length -= length;
            length = btl_endpoint->endpoint_cache_length;
            if( 0 == length ) {
                btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
                break;
            }
        }
        goto advance_iov_position;
    }
    /* What's happens if all iovecs are used by the fragment ? It still work, as we reserve one
     * iovec for the caching in the fragment structure (the +1).
     */
    frag->iov_ptr[num_vecs].iov_base = btl_endpoint->endpoint_cache_pos;
    frag->iov_ptr[num_vecs].iov_len  =
        mca_btl_tcp_component.tcp_endpoint_cache - btl_endpoint->endpoint_cache_length;
    num_vecs++;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* non-blocking read, but continue if interrupted */
    cnt = -1;
    while( cnt < 0 ) {
        cnt = readv(sd, frag->iov_ptr, num_vecs);
	if( 0 < cnt ) goto advance_iov_position;
	if( cnt == 0 ) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
	    mca_btl_tcp_endpoint_close(btl_endpoint);
	    return false;
	}
	switch(opal_socket_errno) {
	case EINTR:
	    continue;
	case EWOULDBLOCK:
	    return false;
	case EFAULT:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n",
                       frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
                       strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return false;
    default:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
                       strerror(opal_socket_errno),
                       opal_socket_errno));
            btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
        mca_btl_tcp_endpoint_close(btl_endpoint);
        return false;
    }
    }

 advance_iov_position:
    /* if the read didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for( i = 0; i < num_vecs; i++ ) {
        if( cnt < (int)frag->iov_ptr->iov_len ) {
            frag->iov_ptr->iov_base = (opal_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            cnt = 0;
            break;
    }
    cnt -= frag->iov_ptr->iov_len;
    frag->iov_idx++;
    frag->iov_ptr++;
    frag->iov_cnt--;
    }
#if MCA_BTL_TCP_ENDPOINT_CACHE
    btl_endpoint->endpoint_cache_length = cnt;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* read header */
    if(frag->iov_cnt == 0) {
        if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
        switch(frag->hdr.type) {
        case MCA_BTL_TCP_HDR_TYPE_SEND:
            if(frag->iov_idx == 1 && frag->hdr.size) {
                frag->segments[0].seg_addr.pval = frag+1;
                frag->segments[0].seg_len = frag->hdr.size;
                frag->iov[1].iov_base = (IOVBASE_TYPE*)(frag->segments[0].seg_addr.pval);
                frag->iov[1].iov_len = frag->hdr.size;
                frag->iov_cnt++;
#ifndef __sparc
#if !MCA_BTL_TCP_SUPPORT_PROGRESS_THREAD
                /* The following cannot be done for sparc code
                 * because it causes alignment errors when accessing
                 * structures later on in the btl and pml code.
                 */
                dont_copy_data = 1;
#endif
#endif
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_PUT:
            if(frag->iov_idx == 1) {
                frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->segments;
                frag->iov[1].iov_len = frag->hdr.count * sizeof(mca_btl_base_segment_t);
                frag->iov_cnt++;
                goto repeat;
            } else if (frag->iov_idx == 2) {
                for( i = 0; i < frag->hdr.count; i++ ) {
                    frag->iov[i+2].iov_base = (IOVBASE_TYPE*)frag->segments[i].seg_addr.pval;
                    frag->iov[i+2].iov_len = frag->segments[i].seg_len;
                }
                frag->iov_cnt += frag->hdr.count;
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_GET:
        default:
            break;
        }
        return true;
    }
    return false;
}
Ejemplo n.º 10
0
static void mca_btl_tcp2_endpoint_recv_handler(int sd, short flags, void* user)
{
    mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t *)user;

    /* Make sure we don't have a race between a thread that remove the
     * recv event, and one event already scheduled.
     */
    if( sd != btl_endpoint->endpoint_sd )
        return;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    switch(btl_endpoint->endpoint_state) {
    case MCA_BTL_TCP_CONNECT_ACK:
        {
            int rc = OMPI_ERROR;
            rc = mca_btl_tcp2_endpoint_recv_connect_ack(btl_endpoint);
            if( OMPI_SUCCESS == rc ) {
                /* we are now connected. Start sending the data */
                OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
                mca_btl_tcp2_endpoint_connected(btl_endpoint);
                OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
                mca_btl_tcp2_endpoint_dump(btl_endpoint, "connected");
#endif
            }
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return;
        }
    case MCA_BTL_TCP_CONNECTED:
        {
            mca_btl_tcp2_frag_t* frag;

            frag = btl_endpoint->endpoint_recv_frag;

        data_still_pending_on_endpoint:
            if(NULL == frag) {

                if(mca_btl_tcp_module.super.btl_max_send_size >
                   mca_btl_tcp_module.super.btl_eager_limit) {
                    MCA_BTL_TCP_FRAG_ALLOC_MAX(frag);
                } else {
                    MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag);
                }

                if(NULL == frag) {
                    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
                    return;
                }
                MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
            }

            /* check for completion of non-blocking recv on the current fragment */
            if( mca_btl_tcp_frag_recv(frag, btl_endpoint->endpoint_sd) == false ) {
                btl_endpoint->endpoint_recv_frag = frag;
            } else {
                btl_endpoint->endpoint_recv_frag = NULL;

                TODO_MCA_BTL_TCP_RECV_TRIGGER_CB(frag);

#if MCA_BTL_TCP_ENDPOINT_CACHE
                if( 0 != btl_endpoint->endpoint_cache_length ) {
#if MCA_BTL_TCP_USES_PROGRESS_THREAD
                    /* Get a new fragment and try again */
                    frag = NULL;
#else
                    /* If the cache still contain some data we can reuse the same fragment
                     * until we flush it completly.
                     */
                    MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
#endif  /* MCA_BTL_TCP_USES_PROGRESS_THREAD */
                    goto data_still_pending_on_endpoint;
                }
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

#if !MCA_BTL_TCP_USES_PROGRESS_THREAD
                MCA_BTL_TCP_FRAG_RETURN(frag);
#endif  /* MCA_BTL_TCP_USES_PROGRESS_THREAD */
            }
#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            break;
        }
    case MCA_BTL_TCP_CLOSED:
        /* This is a thread-safety issue. As multiple threads are allowed
         * to generate events (in the lib event) we endup with several
         * threads executing the receive callback, when we reach the end
         * of the MPI_Finalize. The first one will close the connections,
         * and all others will complain.
         */
        break;
    default:
        OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
        btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
        mca_btl_tcp_endpoint_close(btl_endpoint);
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        break;
    }
}
Ejemplo n.º 11
0
/*
 *  Start a connection to the endpoint. This will likely not complete,
 *  as the socket is set to non-blocking, so register for event
 *  notification of connect completion. On connection we send
 *  our globally unique process identifier to the endpoint and wait for
 *  the endpoints response.
 */
static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int rc,flags;
    struct sockaddr_storage endpoint_addr;
    /* By default consider a IPv4 connection */
    uint16_t af_family = AF_INET;
    opal_socklen_t addrlen = sizeof(struct sockaddr_in);
    
#if OPAL_WANT_IPV6
    if (AF_INET6 == btl_endpoint->endpoint_addr->addr_family) {
        af_family = AF_INET6;
        addrlen = sizeof (struct sockaddr_in6);
    }
#endif
    
    btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
    if (btl_endpoint->endpoint_sd < 0) {
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* setup socket buffer sizes */
    mca_btl_tcp_set_socket_options(btl_endpoint->endpoint_sd);

    /* setup event callbacks */
    mca_btl_tcp_endpoint_event_init(btl_endpoint);

    /* setup the socket as non-blocking */
    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", 
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", 
                       strerror(opal_socket_errno), opal_socket_errno));
    }

    /* start the connect - will likely fail with EINPROGRESS */
    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    opal_output_verbose(20, mca_btl_base_output, 
                        "btl: tcp: attempting to connect() to %s address %s on port %d",
                        ORTE_NAME_PRINT(&btl_endpoint->endpoint_proc->proc_ompi->proc_name),
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(btl_endpoint->endpoint_addr->addr_port));

    if(connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen) < 0) {
        /* non-blocking so wait for completion */
        if(opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
            opal_event_add(&btl_endpoint->endpoint_send_event, 0);
            return OMPI_SUCCESS;
        }
        {
            char *address;
            address = opal_net_get_hostname((struct sockaddr*) &endpoint_addr);
            BTL_PEER_ERROR( btl_endpoint->endpoint_proc->proc_ompi,
                          ( "Unable to connect to the peer %s on port %d: %s\n",
                            address,
                           btl_endpoint->endpoint_addr->addr_port, strerror(opal_socket_errno) ) );
        }
        mca_btl_tcp_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* send our globally unique process identifier to the endpoint */
    if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    } else {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
    return rc;
}