/*
 * Destructor for an openib BTL endpoint.
 *
 * In order: lets the connection pseudo-component (CPC) finalize, frees the
 * CTS buffer, releases the local eager RDMA region, drains and destructs the
 * per-QP pending fragment lists, drops this endpoint's reference on every QP
 * (destroying the QP when the count reaches zero), frees the QP bookkeeping
 * arrays, optionally unregisters the XRC receive QP, and finally drains and
 * destructs the endpoint-level pending lists.
 */
static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
    bool pval_clean = false;
    int qp;

    /* The CPC finalize hook is optional; invoke it only when provided */
    if (NULL != endpoint->endpoint_local_cpc->cbm_endpoint_finalize) {
        endpoint->endpoint_local_cpc->cbm_endpoint_finalize(endpoint);
    }

    /* Release CTS buffer */
    ompi_btl_openib_connect_base_free_cts(endpoint);

    /* Release the eager RDMA memory.  The pointer may hold NULL (nothing
     * allocated), the marker value 1 (mca_btl_openib_endpoint_connect_eager_rdma ()
     * is in its "connect" or "bad" flow and may still put the pointer back
     * to NULL), or a real buffer address.  Keep retrying until we either
     * claim the NULL slot ourselves or find a real buffer to free. */
    while (!pval_clean) {
        if (opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                                   (void*)1)) {
            /* The slot was NULL and is now marked by us: nothing to free */
            pval_clean = true;
        } else if ((void*)1 != endpoint->eager_rdma_local.base.pval &&
                   NULL != endpoint->eager_rdma_local.base.pval) {
            /* A real buffer is installed: hand it back to the memory pool */
            endpoint->endpoint_btl->super.btl_mpool->mpool_free(
                    endpoint->endpoint_btl->super.btl_mpool,
                    endpoint->eager_rdma_local.base.pval,
                    (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
            pval_clean = true;
        }
    }

    /* Close opened QPs if we have them */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        int prio;

        /* Drain both "no credits" lists first, then destruct both */
        for (prio = 0; prio < 2; ++prio) {
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                    &endpoint->qps[qp].no_credits_pending_frags[prio]);
        }
        for (prio = 0; prio < 2; ++prio) {
            OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[prio]);
        }

        /* Same treatment for both "no WQE" lists */
        for (prio = 0; prio < 2; ++prio) {
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                    &endpoint->qps[qp].no_wqe_pending_frags[prio]);
        }
        for (prio = 0; prio < 2; ++prio) {
            OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[prio]);
        }

        /* Drop our reference; only the last user tears the QP down */
        if (0 == --endpoint->qps[qp].qp->users) {
            if (NULL != endpoint->qps[qp].qp->lcl_qp &&
                ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp)) {
                BTL_ERROR(("Failed to destroy QP:%d\n", qp));
            }

            free(endpoint->qps[qp].qp);
        }
    }

    /* free the qps */
    free(endpoint->qps);
    endpoint->qps = NULL;

    /* Remote QP/SRQ descriptions received from the peer */
    free(endpoint->rem_info.rem_qps);
    free(endpoint->rem_info.rem_srqs);

    /* unregister xrc recv qp */
#if HAVE_XRC
    if (0 != endpoint->xrc_recv_qp_num &&
        ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain,
                             endpoint->xrc_recv_qp_num)) {
        BTL_ERROR(("Failed to unregister XRC recv QP:%d\n", endpoint->xrc_recv_qp_num));
    }
#endif

    OBJ_DESTRUCT(&endpoint->endpoint_lock);

    /* Drain, then destruct, each endpoint-level pending list */
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
    OBJ_DESTRUCT(&endpoint->pending_lazy_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
    OBJ_DESTRUCT(&endpoint->pending_get_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
    OBJ_DESTRUCT(&endpoint->pending_put_frags);
}
Exemple #2
0
/*
 *  Start a connection to the endpoint. This will likely not complete,
 *  as the socket is set to non-blocking, so register for event
 *  notification of connect completion. On connection we send
 *  our globally unique process identifier to the endpoint and wait for
 *  the endpoint's response.
 *
 *  Returns OMPI_SUCCESS when the connect is in progress (the endpoint is
 *  moved to MCA_BTL_TCP_CONNECTING) or when it completed immediately and
 *  the connect ack was sent (state MCA_BTL_TCP_CONNECT_ACK).  Returns
 *  OMPI_ERR_UNREACH, after bumping endpoint_retries, when the socket
 *  cannot be created or connect() fails outright; otherwise returns
 *  whatever mca_btl_tcp_endpoint_send_connect_ack() reported.
 */
static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int rc,flags;
    struct sockaddr_storage endpoint_addr;
    /* By default consider a IPv4 connection */
    uint16_t af_family = AF_INET;
    opal_socklen_t addrlen = sizeof(struct sockaddr_in);
    
#if OPAL_WANT_IPV6
    if (AF_INET6 == btl_endpoint->endpoint_addr->addr_family) {
        af_family = AF_INET6;
        addrlen = sizeof (struct sockaddr_in6);
    }
#endif
    
    btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
    if (btl_endpoint->endpoint_sd < 0) {
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* setup socket buffer sizes */
    mca_btl_tcp_set_socket_options(btl_endpoint->endpoint_sd);

    /* setup event callbacks */
    mca_btl_tcp_endpoint_event_init(btl_endpoint);

    /* setup the socket as non-blocking; a failure here is only reported,
     * the connect is still attempted */
    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)", 
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
        flags |= O_NONBLOCK;
        if(fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)", 
                       strerror(opal_socket_errno), opal_socket_errno));
    }

    /* start the connect - will likely fail with EINPROGRESS */
    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    opal_output_verbose(20, mca_btl_base_output, 
                        "btl: tcp: attempting to connect() to %s address %s on port %d",
                        ORTE_NAME_PRINT(&btl_endpoint->endpoint_proc->proc_ompi->proc_name),
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(btl_endpoint->endpoint_addr->addr_port));

    if(connect(btl_endpoint->endpoint_sd, (struct sockaddr*)&endpoint_addr, addrlen) < 0) {
        /* Capture the error immediately: the helpers called while building
         * the diagnostic below may overwrite the socket error value. */
        int connect_errno = opal_socket_errno;
        char *address;

        /* non-blocking so wait for completion */
        if(connect_errno == EINPROGRESS || connect_errno == EWOULDBLOCK) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
            opal_event_add(&btl_endpoint->endpoint_send_event, 0);
            return OMPI_SUCCESS;
        }

        /* Hard failure.  addr_port is converted with ntohs(), exactly as in
         * the verbose trace above, so the reported port is host-readable. */
        address = opal_net_get_hostname((struct sockaddr*) &endpoint_addr);
        BTL_PEER_ERROR( btl_endpoint->endpoint_proc->proc_ompi,
                      ( "Unable to connect to the peer %s on port %d: %s\n",
                        address,
                        ntohs(btl_endpoint->endpoint_addr->addr_port),
                        strerror(connect_errno) ) );

        mca_btl_tcp_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* connect() completed right away: send our globally unique process
     * identifier to the endpoint and wait for its answer */
    if((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    } else {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
    return rc;
}
Exemple #3
0
/* This async event thread handles all async events of
 * all btls/devices in the openib component.
 *
 * It builds a poll set with btl_openib_async_poll_init() and then loops
 * forever on poll().  Slot 0 of the poll set carries commands from the
 * main thread (btl_openib_async_commandh); every other slot is a device
 * descriptor (btl_openib_async_deviceh).
 *
 * The thread only terminates through pthread_exit(&return_status) on a
 * fatal error; the poll array is freed first on every such path taken
 * after a successful init.  The `async` argument is not used.
 */
void* btl_openib_async_thread(void * async)
{
    int rc;
    int i;
    struct mca_btl_openib_async_poll devices_poll;

    if (OMPI_SUCCESS != btl_openib_async_poll_init(&devices_poll)) {
        BTL_ERROR(("Fatal error, stoping asynch event thread"));
        pthread_exit(&return_status);
    }

    while(1) {
        /* Block without timeout until at least one descriptor is ready */
        rc = poll(devices_poll.async_pollfd, devices_poll.active_poll_size, -1);
        if (rc < 0) {
            if (errno != EINTR) {
                BTL_ERROR(("Poll failed.  Fatal error, stoping asynch event thread"));
                /* Release the poll array here too, as every other fatal
                 * exit path below does */
                free(devices_poll.async_pollfd);
                pthread_exit(&return_status);
            } else {
                /* EINTR - we were interrupted, just poll again */
                continue;
            }
        }
        for(i = 0; i < devices_poll.active_poll_size; i++) {
            switch (devices_poll.async_pollfd[i].revents) {
                case 0:
                    /* no events */
                    break;
                case POLLIN:
#if defined(__SVR4) && defined(__sun)
                /*
                 * Need workaround for Solaris IB user verbs since
                 * "Poll on IB async fd returns POLLRDNORM revent even though it is masked out"
                 */
                case POLLIN | POLLRDNORM:
#endif
                    /* Processing our event */
                    if (0 == i) {
                        /* slot 0 is used for communication with the main thread */
                        if (OMPI_SUCCESS != btl_openib_async_commandh(&devices_poll)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    } else {
                        /* We got a device event */
                        if (btl_openib_async_deviceh(&devices_poll, i)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    }
                    break;
                default:
                    /* Got an event other than POLLIN;
                     * this case should never happen */
                    BTL_ERROR(("Got unexpected event %d.  "
                               "Fatal error, stoping asynch event thread",
                               devices_poll.async_pollfd[i].revents));
                    free(devices_poll.async_pollfd);
                    pthread_exit(&return_status);
            }
        }
    }
    /* Not reached: the loop above only leaves through pthread_exit() */
    return PTHREAD_CANCELED;
}
/*
 * Receive (part of) a fragment from socket `sd` into `frag`.
 *
 * The fragment is read in stages driven by frag->iov_idx: first the header,
 * then, depending on frag->hdr.type, the payload (SEND) or the segment
 * descriptors followed by the segment data (PUT).  Each completed stage
 * appends new iovec entries and jumps back to `repeat`.
 *
 * When MCA_BTL_TCP_ENDPOINT_CACHE is enabled, data already sitting in the
 * endpoint cache is consumed before touching the socket, and one extra
 * iovec is appended to readv() so that surplus bytes land in the cache.
 *
 * Returns true once every iovec of the fragment has been filled and no
 * further stage is required.  Returns false when more data is needed
 * (partial read or EWOULDBLOCK) and also when the peer closed the
 * connection or readv() failed; in those two cases the endpoint is closed
 * before returning.
 */
bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
{
    int cnt, dont_copy_data = 0;
    size_t i, num_vecs;
    mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;

 repeat:
    num_vecs = frag->iov_cnt;
#if MCA_BTL_TCP_ENDPOINT_CACHE
    if( 0 != btl_endpoint->endpoint_cache_length ) {
        size_t length;
        /* It's strange at the first look but cnt have to be set to the full amount of data
         * available. After going to advance_iov_position we will use cnt to detect if there
         * is still some data pending.
         */
        cnt = length = btl_endpoint->endpoint_cache_length;
        for( i = 0; i < frag->iov_cnt; i++ ) {
            if( length > frag->iov_ptr[i].iov_len )
                length = frag->iov_ptr[i].iov_len;
            if( (0 == dont_copy_data) || (length < frag->iov_ptr[i].iov_len) ) {
                memcpy( frag->iov_ptr[i].iov_base, btl_endpoint->endpoint_cache_pos, length );
            } else {
                /* The whole iovec is available in the cache: point the
                 * fragment at the cached bytes instead of copying them */
                frag->segments[0].seg_addr.pval = btl_endpoint->endpoint_cache_pos;
                frag->iov_ptr[i].iov_base = btl_endpoint->endpoint_cache_pos;
            }
            btl_endpoint->endpoint_cache_pos += length;
            btl_endpoint->endpoint_cache_length -= length;
            length = btl_endpoint->endpoint_cache_length;
            if( 0 == length ) {
                /* Cache drained: rewind to its beginning */
                btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
                break;
            }
        }
        goto advance_iov_position;
    }
    /* What's happens if all iovecs are used by the fragment ? It still work, as we reserve one
     * iovec for the caching in the fragment structure (the +1).
     */
    frag->iov_ptr[num_vecs].iov_base = btl_endpoint->endpoint_cache_pos;
    frag->iov_ptr[num_vecs].iov_len  = 
        mca_btl_tcp_component.tcp_endpoint_cache - btl_endpoint->endpoint_cache_length;
    num_vecs++;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* non-blocking read, but continue if interrupted */
    cnt = -1;
    while( cnt < 0 ) {
        cnt = readv(sd, frag->iov_ptr, num_vecs);
        if( 0 < cnt ) goto advance_iov_position;
        if( cnt == 0 ) {
            /* Orderly shutdown by the peer */
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
        switch(opal_socket_errno) {
        case EINTR:
            continue;
        case EWOULDBLOCK:
            return false;
        case EFAULT:
            /* iov_len and iov_cnt are not ints: print them through an
             * explicit unsigned long conversion so the conversion
             * specifiers match the arguments. */
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n",
                       (void*) frag->iov_ptr[0].iov_base,
                       (unsigned long) frag->iov_ptr[0].iov_len,
                       strerror(opal_socket_errno),
                       (unsigned long) frag->iov_cnt));
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        default:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)", 
                       strerror(opal_socket_errno),
                       opal_socket_errno));
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
    };

 advance_iov_position:
    /* if the read didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for( i = 0; i < num_vecs; i++ ) {
        if( cnt < (int)frag->iov_ptr->iov_len ) {
            /* Current iovec only partially filled: advance inside it */
            frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            cnt = 0;
            break;
        }
        /* Current iovec completely filled: move on to the next one */
        cnt -= frag->iov_ptr->iov_len;
        frag->iov_idx++;
        frag->iov_ptr++;
        frag->iov_cnt--;
    }
#if MCA_BTL_TCP_ENDPOINT_CACHE
    /* Whatever is left in cnt was read past the fragment, into the cache */
    btl_endpoint->endpoint_cache_length = cnt;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* read header */
    if(frag->iov_cnt == 0) {
        /* Convert the header only once, right after it has been read */
        if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1) MCA_BTL_TCP_HDR_NTOH(frag->hdr);
        switch(frag->hdr.type) {
        case MCA_BTL_TCP_HDR_TYPE_SEND:
            if(frag->iov_idx == 1 && frag->hdr.size) {
                /* Header done, payload follows: it is stored right behind
                 * the fragment structure */
                frag->segments[0].seg_addr.pval = frag+1;
                frag->segments[0].seg_len = frag->hdr.size;
                frag->iov[1].iov_base = (IOVBASE_TYPE*)(frag->segments[0].seg_addr.pval);
                frag->iov[1].iov_len = frag->hdr.size;
                frag->iov_cnt++;
#ifndef __sparc
                /* The following cannot be done for sparc code 
                 * because it causes alignment errors when accessing
                 * structures later on in the btl and pml code.
                 */
                dont_copy_data = 1;
#endif
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_PUT:
            if(frag->iov_idx == 1) {
                /* Header done: next read the hdr.count segment descriptors */
                frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->segments;
                frag->iov[1].iov_len = frag->hdr.count * sizeof(mca_btl_base_segment_t);
                frag->iov_cnt++;
                goto repeat;
            } else if (frag->iov_idx == 2) {
                /* Descriptors done: read the data straight into the
                 * addresses they describe */
                for( i = 0; i < frag->hdr.count; i++ ) {
                    frag->iov[i+2].iov_base = (IOVBASE_TYPE*)ompi_ptr_ltop(frag->segments[i].seg_addr.lval);
                    frag->iov[i+2].iov_len = frag->segments[i].seg_len;
                }
                frag->iov_cnt += frag->hdr.count;
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_GET:
        default:
            break;
        }
        return true;
    }
    return false;
}
/*
 * Progress one completed uGNI datagram, if any.
 *
 * Probes the device for a completed datagram, waits for it on either the
 * directed endpoint it belongs to or the wildcard endpoint, resolves the
 * peer endpoint, advances that endpoint's connection state and, once the
 * endpoint is connected, processes its SMSG mailbox.  A wildcard datagram
 * is re-posted before returning.
 *
 * Returns 0 when the probe reports no completed datagram, the converted
 * uGNI error when waiting for the datagram fails, the hash lookup result
 * when no endpoint can be obtained for an unknown peer, and otherwise the
 * value returned by mca_btl_ugni_smsg_process() (0 if the endpoint is not
 * connected yet).
 */
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint64_t dgram_id, dgram_kind, dgram_data, peer_id;
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    gni_return_t gni_rc;
    int processed = 0, rc;

    /* check for datagram completion */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);  /* TODO: may not need lock for this function */
    gni_rc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &dgram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != gni_rc)) {
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    /* Split the id into the bits under the datagram mask and the rest */
    dgram_kind = (uint64_t) (dgram_id & MCA_BTL_UGNI_DATAGRAM_MASK);
    dgram_data = dgram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK);

    BTL_VERBOSE(("datgram_id: %" PRIx64 ", mask: %" PRIx64, dgram_id, dgram_kind));

    if (MCA_BTL_UGNI_CONNECT_DIRECTED_ID != dgram_kind) {
        handle = ugni_module->wildcard_ep;
    } else {
        /* directed datagram: the remaining bits index the endpoint array */
        ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&ugni_module->endpoints, dgram_data);
        handle = ep->smsg_ep_handle;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    gni_rc = GNI_EpPostDataWaitById (handle, dgram_id, -1, &post_state,
                                     &remote_addr, &remote_id);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (GNI_RC_SUCCESS != gni_rc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", gni_rc));
        return opal_common_rc_ugni_to_opal (gni_rc);
    }

    if (handle != ugni_module->wildcard_ep) {
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
    } else {
        /* wildcard endpoint: look the remote peer up by the proc id we received */
        peer_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64,
                     peer_id));

        OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
        rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, peer_id, (void **) &ep);
        OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);

        /* peer not known yet: try to obtain an endpoint for it */
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) {
            struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
            BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vid = 0x%x}",
                         ugni_module->wc_remote_attr.proc_name.jobid, ugni_module->wc_remote_attr.proc_name.vpid));
            ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
            if (OPAL_UNLIKELY(NULL == ep)) {
                return rc;
            }
        }
    }

    /* should not have gotten a NULL endpoint */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "data = 0x%" PRIx64 ", ep = %p, remote id: %d", dgram_id, post_state,
                 dgram_data, (void *) ep, remote_id));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&ep->lock);
    if (handle != ugni_module->wildcard_ep) {
        /* directed post complete */
        ep->dg_posted = false;
    }

    (void) mca_btl_ugni_ep_connect_progress (ep);
    opal_mutex_unlock (&ep->lock);

    /* process messages waiting in the endpoint's smsg mailbox, but only
     * once the endpoint has reached the connected state */
    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        processed = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if (handle == ugni_module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return processed;
}
/* this function is called with endpoint->endpoint_lock held
 *
 * Post a send fragment on the endpoint.
 *
 * Acquires a WQE and either an eager RDMA send credit (small,
 * MCA_BTL_DES_FLAGS_PRIORITY fragments) or a regular send credit,
 * piggybacks credit information into the fragment header, and hands
 * the fragment to post_send().
 *
 * Returns OMPI_SUCCESS when post_send() returns 0,
 * OMPI_ERR_RESOURCE_BUSY when no WQE or no send credit could be
 * acquired, and OMPI_ERROR when post_send() fails.  In the failure
 * case the WQE, the credits taken from the local counters and the
 * send token/credit are given back before returning.
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    size_t eager_limit;

    /* An unordered descriptor is bound to the fragment's own QP */
    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acruire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_RESOURCE_BUSY;

    /* Largest size still eligible for the eager RDMA path: the
     * configured eager limit plus the coalesced and control headers */
    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);
    if(des->des_src->seg_len + frag->coalesced_length <= eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    /* Not going over eager RDMA: a regular send credit is required;
     * without one the WQE taken above is returned */
    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* Piggyback the eager RDMA credits; the flag marks them as such */
    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    if(!do_rdma) {
        /* No RDMA credits to return: send the per-peer QP receive
         * credits instead */
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }
    } else {
        /* Eager RDMA send: the QP index is encoded into the credits
         * field starting at bit 11 */
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_return is a 32-bit counter, but at most 255 is reported through
     * cm_seen (presumably an 8-bit header field); whatever exceeds that
     * is put back into the counter for a later send */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if(!ib_rc)
        return OMPI_SUCCESS;

    /* post_send() failed: undo the accounting done above.
     * NOTE(review): the header is converted with NTOH first, presumably
     * because post_send() put it into network byte order - confirm. */
    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    /* Give the piggybacked eager RDMA credits back to the local counter */
    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_OPENIB_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if(do_rdma) {
        /* Return the eager RDMA send token */
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            /* Per-peer QP: restore the receive credits and the send credit */
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                    hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else if BTL_OPENIB_QP_TYPE_SRQ(qp){
            /* SRQ QP: the send credit is kept at module level */
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}
Exemple #7
0
/*
 * Return the openib proc structure for `proc`, creating it on first use.
 *
 * If mca_btl_openib_proc_lookup_proc() already knows the process, that
 * structure is returned unchanged.  Otherwise a new one is allocated and
 * filled from the peer's modex message: the number of ports, then for each
 * port the fixed-size port info followed by its list of CPC entries
 * (index, priority, optional opaque blob).
 *
 * Returns NULL, after releasing the partially built structure, when the
 * modex receive fails, the peer published an empty message, the peer
 * advertises no ports, or an allocation fails.
 *
 * NOTE(review): the release on the allocation-failure paths assumes the
 * proc destructor tolerates zero counts and NULL pointers in proc_ports
 * (the array is zero-initialised for that reason) - confirm against the
 * destructor.
 */
mca_btl_openib_proc_t* mca_btl_openib_proc_create(opal_proc_t* proc)
{
    mca_btl_openib_proc_t* module_proc = NULL;
    size_t msg_size;
    uint32_t size;
    int rc, i, j;
    void *message;
    char *offset;
    int modex_message_size;
    mca_btl_openib_modex_message_t dummy;

    /* Check if we have already created a IB proc
     * structure for this ompi process */
    module_proc = mca_btl_openib_proc_lookup_proc(proc);
    if (NULL != module_proc) {
        /* Gotcha! */
        return module_proc;
    }

    /* Oops! First time, gotta create a new IB proc
     * out of the opal_proc ... */
    module_proc = OBJ_NEW(mca_btl_openib_proc_t);
    if (NULL == module_proc) {
        return NULL;
    }
    /* Initialize number of peer */
    module_proc->proc_endpoint_count = 0;
    module_proc->proc_opal = proc;

    /* query for the peer address info */
    OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version,
                    proc, &message, &msg_size);
    if (OPAL_SUCCESS != rc) {
        BTL_ERROR(("[%s:%d] opal_modex_recv failed for peer %s",
                   __FILE__, __LINE__,
                   OPAL_NAME_PRINT(proc->proc_name)));
        goto release_and_fail;
    }
    if (0 == msg_size) {
        /* Nothing published by the peer: do not leak the new object */
        goto release_and_fail;
    }

    /* Message was packed in btl_openib_component.c; the format is
       listed in a comment in that file */
    modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy);

    /* Unpack the number of modules in the message */
    offset = (char *) message;
    unpack8(&offset, &(module_proc->proc_port_count));
    BTL_VERBOSE(("unpack: %d btls", module_proc->proc_port_count));
    if (module_proc->proc_port_count > 0) {
        /* Zero-initialised so that entries not unpacked yet hold a zero
         * CPC count and NULL pointers if we have to bail out early */
        module_proc->proc_ports = (mca_btl_openib_proc_modex_t *)
            calloc(module_proc->proc_port_count,
                   sizeof(mca_btl_openib_proc_modex_t));
        if (NULL == module_proc->proc_ports) {
            BTL_ERROR(("Failed to malloc"));
            module_proc->proc_port_count = 0;
            goto release_and_fail;
        }
    } else {
        module_proc->proc_ports = NULL;
    }

    /* Loop over unpacking all the ports */
    for (i = 0; i < module_proc->proc_port_count; i++) {

        /* Unpack the modex comment message struct */
        size = modex_message_size;
        memcpy(&(module_proc->proc_ports[i].pm_port_info), offset, size);
#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
        MCA_BTL_OPENIB_MODEX_MSG_NTOH(module_proc->proc_ports[i].pm_port_info);
#endif
        offset += size;
        BTL_VERBOSE(("unpacked btl %d: modex message, offset now %d",
                     i, (int)(offset-((char*)message))));

        /* Unpack the number of CPCs that follow */
        unpack8(&offset, &(module_proc->proc_ports[i].pm_cpc_data_count));
        BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)",
                     i, module_proc->proc_ports[i].pm_cpc_data_count,
                     (int)(offset-((char*)message))));
        module_proc->proc_ports[i].pm_cpc_data = (opal_btl_openib_connect_base_module_data_t *)
            calloc(module_proc->proc_ports[i].pm_cpc_data_count,
                   sizeof(opal_btl_openib_connect_base_module_data_t));
        /* calloc() may legitimately return NULL for a zero count, so a
         * NULL result only means failure when CPC entries are expected */
        if (NULL == module_proc->proc_ports[i].pm_cpc_data &&
            0 != module_proc->proc_ports[i].pm_cpc_data_count) {
            BTL_ERROR(("Failed to malloc"));
            module_proc->proc_ports[i].pm_cpc_data_count = 0;
            goto release_and_fail;
        }

        /* Unpack the CPCs */
        for (j = 0; j < module_proc->proc_ports[i].pm_cpc_data_count; ++j) {
            uint8_t u8;
            opal_btl_openib_connect_base_module_data_t *cpcd;
            cpcd = module_proc->proc_ports[i].pm_cpc_data + j;
            unpack8(&offset, &u8);
            BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)",
                         i, j, u8, (int)(offset-(char*)message)));
            cpcd->cbm_component =
                opal_btl_openib_connect_base_get_cpc_byindex(u8);
            /* Do not dereference the component just to log its name if
             * the index did not resolve to one */
            BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s",
                         i, j,
                         (NULL != cpcd->cbm_component) ?
                             cpcd->cbm_component->cbc_name : "(unknown)"));

            unpack8(&offset, &cpcd->cbm_priority);
            unpack8(&offset, &cpcd->cbm_modex_message_len);
            BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)",
                         i, j, cpcd->cbm_priority,
                         cpcd->cbm_modex_message_len,
                         (int)(offset-(char*)message)));
            if (cpcd->cbm_modex_message_len > 0) {
                cpcd->cbm_modex_message = malloc(cpcd->cbm_modex_message_len);
                if (NULL == cpcd->cbm_modex_message) {
                    BTL_ERROR(("Failed to malloc"));
                    cpcd->cbm_modex_message_len = 0;
                    goto release_and_fail;
                }
                memcpy(cpcd->cbm_modex_message, offset,
                       cpcd->cbm_modex_message_len);
                offset += cpcd->cbm_modex_message_len;
                BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)",
                             i, j,
                             ((uint32_t*)cpcd->cbm_modex_message)[0],
                             ((uint32_t*)cpcd->cbm_modex_message)[1],
                             (int)(offset-((char*)message))));
            }
        }
    }

    /* One endpoint slot per advertised port; a peer without ports yields
     * no usable proc and is reported as a failure below */
    if (0 == module_proc->proc_port_count) {
        module_proc->proc_endpoints = NULL;
    } else {
        module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(module_proc->proc_port_count *
                   sizeof(mca_btl_base_endpoint_t*));
    }
    if (NULL == module_proc->proc_endpoints) {
        goto release_and_fail;
    }

    BTL_VERBOSE(("unpacking done!"));
    return module_proc;

release_and_fail:
    /* Single failure exit: drop the reference taken by OBJ_NEW */
    OBJ_RELEASE(module_proc);
    return NULL;
}
Exemple #8
0
/*
 * Drain the completion queue of one OFI context.
 *
 * Reads up to mca_btl_ofi_component.num_cqe_read entries with fi_cq_read()
 * and dispatches every entry that carries an op_context according to its
 * completion type; the completion object is then returned to its free
 * list.  Any fi_cq_read() result other than a positive count, -FI_EAGAIN
 * or (when defined) -FI_EINTR is reported and followed by
 * MCA_BTL_OFI_ABORT().
 *
 * Returns the number of completions that were dispatched.
 */
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context) {

    struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
    struct fi_cq_err_entry cqerr = {0};
    int events = 0;
    int ret = 0;

    mca_btl_ofi_completion_context_t *c_ctx;
    mca_btl_ofi_base_completion_t *comp;
    mca_btl_ofi_rdma_completion_t *rdma_comp;
    mca_btl_ofi_frag_completion_t *frag_comp;

    ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);

    /* Nothing was read: sort out why, then leave */
    if (0 >= ret) {
        if (OPAL_UNLIKELY(-FI_EAVAIL == ret)) {
            /* an error entry is available: fetch it, report, abort */
            ret = fi_cq_readerr(context->cq, &cqerr, 0);

            if (0 <= ret) {
                BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
                           cqerr.prov_errno));
            } else {
                /* cq readerr failed!? */
                BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
                           __FILE__, __LINE__, fi_strerror(-ret), ret));
            }
            MCA_BTL_OFI_ABORT();
        }
#ifdef FI_EINTR
        /* sometimes, sockets provider complain about interupt. We do nothing. */
        else if (OPAL_UNLIKELY(-FI_EINTR == ret)) {

        }
#endif
        /* If the error is not FI_EAGAIN, report the error and abort. */
        else if (OPAL_UNLIKELY(-FI_EAGAIN != ret)) {
            BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
            MCA_BTL_OFI_ABORT();
        }

        return events;
    }

    /* ret holds the number of completion entries that were read */
    for (int idx = 0; idx < ret; idx++) {
        if (NULL == cq_entry[idx].op_context) {
            continue;
        }

        ++events;

        c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[idx].op_context;

        /* The same completion object is viewed through every completion
         * type; which view is meaningful depends on comp->type. */
        comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
        rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;
        frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;

        switch (comp->type) {
        case MCA_BTL_OFI_TYPE_SEND:
            MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
            mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
            break;

        case MCA_BTL_OFI_TYPE_RECV:
            mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*)  comp->btl,
                                  (mca_btl_ofi_endpoint_t*) comp->endpoint,
                                  context, frag_comp->frag);
            break;

        case MCA_BTL_OFI_TYPE_GET:
        case MCA_BTL_OFI_TYPE_PUT:
        case MCA_BTL_OFI_TYPE_AOP:
        case MCA_BTL_OFI_TYPE_AFOP:
        case MCA_BTL_OFI_TYPE_CSWAP:
            /* one-sided operation finished: notify the user, if asked to */
            if (rdma_comp->cbfunc) {
                rdma_comp->cbfunc (comp->btl, comp->endpoint,
                                   rdma_comp->local_address, rdma_comp->local_handle,
                                   rdma_comp->cbcontext, rdma_comp->cbdata, OPAL_SUCCESS);
            }

            MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
            break;

        default:
            /* catasthrophic */
            BTL_ERROR(("unknown completion type"));
            MCA_BTL_OFI_ABORT();
        }

        /* return the completion handler */
        opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
    }

    return events;
}
static int init_ud_qp(struct ibv_context *context_arg,
                      struct mca_btl_openib_sa_qp_cache *cache)
{
    struct ibv_qp_init_attr iattr;
    struct ibv_qp_attr mattr;
    int rc;

    /* create cq */
    cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0);
    if (NULL == cache->cq) {
        BTL_ERROR(("error creating cq, errno says %s", strerror(errno)));
        opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
                true, opal_process_info.nodename,
                __FILE__, __LINE__, "ibv_create_cq",
                strerror(errno), errno,
                ibv_get_device_name(context_arg->device));
        return OPAL_ERROR;
    }

    /* create qp */
    memset(&iattr, 0, sizeof(iattr));
    iattr.send_cq = cache->cq;
    iattr.recv_cq = cache->cq;
    iattr.cap.max_send_wr = 1;
    iattr.cap.max_recv_wr = 1;
    iattr.cap.max_send_sge = 1;
    iattr.cap.max_recv_sge = 1;
    iattr.qp_type = IBV_QPT_UD;
    cache->qp = ibv_create_qp(cache->pd, &iattr);
    if (NULL == cache->qp) {
        BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_INIT */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_INIT;
    mattr.port_num = cache->port_num;
    mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
    rc = ibv_modify_qp(cache->qp, &mattr,
            IBV_QP_STATE              |
            IBV_QP_PKEY_INDEX         |
            IBV_QP_PORT               |
            IBV_QP_QKEY);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTR */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_RTR;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTS */
    mattr.qp_state = IBV_QPS_RTS;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
Exemple #10
0
/*
 * Open the TCP BTL component.
 *
 * Resets the component state, constructs the component's lock, hash table,
 * event list and fragment free lists, registers the TCP MCA parameters and
 * fills in the default values of the TCP module before handing them to
 * mca_btl_base_param_register().
 *
 * Returns OMPI_SUCCESS.  On __WINDOWS__ builds OMPI_ERROR is returned when
 * WSAStartup() fails.
 */
int mca_btl_tcp_component_open(void)
{
    char* message;
#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls=0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls=NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* register TCP component parameters */
    mca_btl_tcp_component.tcp_num_links =
        mca_btl_tcp_param_register_int("links", NULL, 1);
    mca_btl_tcp_component.tcp_if_include =
        mca_btl_tcp_param_register_string("if_include", NULL, "");
    mca_btl_tcp_component.tcp_if_exclude =
        mca_btl_tcp_param_register_string("if_exclude", NULL, "lo");
    mca_btl_tcp_component.tcp_free_list_num =
        mca_btl_tcp_param_register_int ("free_list_num", NULL, 8);
    mca_btl_tcp_component.tcp_free_list_max =
        mca_btl_tcp_param_register_int ("free_list_max", NULL, -1);
    mca_btl_tcp_component.tcp_free_list_inc =
        mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32);
    mca_btl_tcp_component.tcp_sndbuf =
        mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_rcvbuf =
        mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_endpoint_cache =
        mca_btl_tcp_param_register_int ("endpoint_cache",
                                        "The size of the internal cache for each TCP connection. This cache is"
                                        " used to reduce the number of syscalls, by replacing them with memcpy."
                                        " Every read will read the expected data plus the amount of the"
                                        " endpoint_cache", 30*1024);
    /* the parameter is expressed as "use Nagle", the component stores the
     * opposite ("use TCP_NODELAY"-style flag), hence the negation */
    mca_btl_tcp_component.tcp_use_nodelay =
        !mca_btl_tcp_param_register_int ("use_nagle", "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)", 0);
    mca_btl_tcp_component.tcp_port_min =
        mca_btl_tcp_param_register_int( "port_min_v4",
                                        "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 );
    if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) {
        /* out-of-range value: warn and fall back to the default */
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v4", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp_port_min );
        mca_btl_tcp_component.tcp_port_min = 1024;
    }
    if( asprintf( &message,
                  "The number of ports where the TCP BTL will try to bind (default %d)."
                  " This parameter together with the port min, define a range of ports"
                  " where Open MPI will open sockets.",
                  (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ) < 0 ) {
        /* On failure the content of message is undefined: register the
         * parameter without a help string (as other parameters here do)
         * and make the free() below a no-op. */
        message = NULL;
    }
    mca_btl_tcp_component.tcp_port_range =
        mca_btl_tcp_param_register_int( "port_range_v4", message,
                                        (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1);
    free(message);
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_port_min =
        mca_btl_tcp_param_register_int( "port_min_v6",
                                        "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 );
    if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) {
        /* out-of-range value: warn and fall back to the default */
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v6", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp6_port_min );
        mca_btl_tcp_component.tcp6_port_min = 1024;
    }
    if( asprintf( &message,
                  "The number of ports where the TCP BTL will try to bind (default %d)."
                  " This parameter together with the port min, define a range of ports"
                  " where Open MPI will open sockets.",
                  (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ) < 0 ) {
        /* same fallback as for the v4 range: no help string */
        message = NULL;
    }
    mca_btl_tcp_component.tcp6_port_range =
        mca_btl_tcp_param_register_int( "port_range_v6", message,
                                        (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1);
    free(message);
#endif
    /* module defaults; mca_btl_base_param_register() is given the module
     * right after these assignments */
    mca_btl_tcp_module.super.btl_exclusivity =  MCA_BTL_EXCLUSIVITY_LOW + 100;
    mca_btl_tcp_module.super.btl_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
    mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
    mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                         MCA_BTL_FLAGS_SEND_INPLACE |
                                         MCA_BTL_FLAGS_NEED_CSUM |
                                         MCA_BTL_FLAGS_NEED_ACK |
                                         MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
    mca_btl_tcp_module.super.btl_bandwidth = 100;
    mca_btl_tcp_module.super.btl_latency = 100;
    mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                &mca_btl_tcp_module.super);

    mca_btl_tcp_component.tcp_disable_family =
        mca_btl_tcp_param_register_int ("disable_family", NULL, 0);

    return OMPI_SUCCESS;
}
/*
 * Look up, or create and register, the TCP BTL proc instance for "proc".
 *
 * The whole operation runs under mca_btl_tcp_component.tcp_lock.  When an
 * entry already exists in the tcp_procs table it is returned as is.
 * Otherwise a new mca_btl_tcp_proc_t is built from the addresses obtained
 * through OPAL_MODEX_RECV and stored in the table.
 *
 * Returns the proc instance, or NULL when it could not be created.
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
{
    mca_btl_tcp_proc_t* btl_proc;
    size_t modex_bytes;
    int status;

    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);

    /* fast path: this proc is already known */
    status = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
                                       proc->proc_name, (void**)&btl_proc);
    if (OPAL_SUCCESS == status) {
        goto unlock;
    }

    btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
    if (NULL == btl_proc) {
        status = OPAL_ERR_OUT_OF_RESOURCE;
        goto failed;
    }

    btl_proc->proc_opal = proc;
    OBJ_RETAIN(btl_proc->proc_opal);

    /* fetch the TCP addresses published by the peer */
    OPAL_MODEX_RECV(status, &mca_btl_tcp_component.super.btl_version,
                    &proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &modex_bytes);
    if (OPAL_SUCCESS != status) {
        /* a peer that published nothing is not reported as an error */
        if (OPAL_ERR_NOT_FOUND != status) {
            BTL_ERROR(("opal_modex_recv: failed with return value=%d", status));
        }
        goto failed;
    }

    /* the blob must be a whole number of address records */
    if (0 != (modex_bytes % sizeof(mca_btl_tcp_addr_t))) {
        BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
                   (unsigned long) modex_bytes, (unsigned long)sizeof(mca_btl_tcp_addr_t)));
        status = OPAL_ERROR;
        goto failed;
    }

    btl_proc->proc_addr_count = modex_bytes / sizeof(mca_btl_tcp_addr_t);

    /* endpoint array: one slot per exported address, plus one */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc((1 + btl_proc->proc_addr_count) *
               sizeof(mca_btl_base_endpoint_t*));
    if (NULL == btl_proc->proc_endpoints) {
        status = OPAL_ERR_OUT_OF_RESOURCE;
        goto failed;
    }

    /* remember the instance describing the local process */
    if (NULL == mca_btl_tcp_component.tcp_local && (proc == opal_proc_local_get())) {
        mca_btl_tcp_component.tcp_local = btl_proc;
    }

    /* Translate the OPAL address-family tags into the OS constants so
     * later code can test for AF_INET / AF_INET6 directly. */
    {
        unsigned int idx = 0;
        while (idx < btl_proc->proc_addr_count) {
            if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[idx].addr_family) {
                btl_proc->proc_addrs[idx].addr_family = AF_INET;
            }
#if OPAL_ENABLE_IPV6
            if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[idx].addr_family) {
                btl_proc->proc_addrs[idx].addr_family = AF_INET6;
            }
#endif
            idx++;
        }
    }

    /* publish the new instance in the table of all procs */
    opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
                              proc->proc_name, btl_proc);
    goto unlock;

failed:
    if (btl_proc) {
        OBJ_RELEASE(btl_proc);
        btl_proc = NULL;
    }

unlock:
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);

    return btl_proc;
}
/* Example #12 */
/*
 * Handle one command received by the async event thread.
 *
 * A mca_btl_openib_async_cmd_t is read from the first descriptor of the
 * poll array and dispatched on cmd.a_cmd:
 *   - OPENIB_ASYNC_CMD_FD_ADD:    make cmd.fd non-blocking and append it to
 *                                 the poll array (growing the array if needed);
 *   - OPENIB_ASYNC_CMD_FD_REMOVE: drop cmd.fd from the poll array;
 *   - OPENIB_ASYNC_IGNORE_QP_ERR: append cmd.qp to ignore_qp_err_list;
 *   - OPENIB_ASYNC_THREAD_EXIT:   free the poll array, empty and destruct
 *                                 ignore_qp_err_list and exit the thread.
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR on failure.  The exit command does not
 * return (pthread_exit).
 */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll, opal_list_t *ignore_qp_err_list)
{
    struct pollfd *async_pollfd_tmp;
    mca_btl_openib_async_cmd_t cmd;
    int fd,flags,j,ret;
    /* Got command from main thread */
    ret = read(devices_poll->async_pollfd[0].fd, &cmd, sizeof(mca_btl_openib_async_cmd_t));
    if (sizeof(mca_btl_openib_async_cmd_t) != ret) {
        BTL_ERROR(("Read failed [%d]",errno));
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("Got cmd %d", cmd.a_cmd));
    if (OPENIB_ASYNC_CMD_FD_ADD == cmd.a_cmd) {
        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));
        BTL_VERBOSE(("Adding device [%d] to async event poll[%d]",
                     fd, devices_poll->active_poll_size));
        /* F_GETFL returns -1 on error; OR-ing O_NONBLOCK into that value
         * would hand garbage flags to F_SETFL, so check both calls. */
        flags = fcntl(fd, F_GETFL);
        if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            BTL_ERROR(("Failed to change file descriptor of async event"));
            return OPAL_ERROR;
        }
        if ((devices_poll->active_poll_size + 1) > devices_poll->poll_size) {
            /* poll array is full: double its capacity */
            devices_poll->poll_size+=devices_poll->poll_size;
            async_pollfd_tmp = malloc(sizeof(struct pollfd) * devices_poll->poll_size);
            if (NULL == async_pollfd_tmp) {
                BTL_ERROR(("Failed malloc: %s:%d.  "
                            "Fatal error, stoping asynch event thread"
                            , __FILE__, __LINE__));
                return OPAL_ERROR;
            }
            memcpy (async_pollfd_tmp,devices_poll->async_pollfd,
                    sizeof(struct pollfd) * (devices_poll->active_poll_size));
            free(devices_poll->async_pollfd);
            devices_poll->async_pollfd = async_pollfd_tmp;
        }
        devices_poll->async_pollfd[devices_poll->active_poll_size].fd = fd;
        devices_poll->async_pollfd[devices_poll->active_poll_size].events = POLLIN;
        devices_poll->async_pollfd[devices_poll->active_poll_size].revents = 0;
        devices_poll->active_poll_size++;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_CMD_FD_REMOVE == cmd.a_cmd) {
        bool fd_found = false;

        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));

        /* Removing device from poll */
        BTL_VERBOSE(("Removing device [%d] from async event poll [%d]",
                     fd, devices_poll->active_poll_size));
        if (devices_poll->active_poll_size > 1) {
            /* Stop at the first match AND at the end of the active entries.
             * (With "||" a missing fd made the scan run past the array and
             * the "not found" error below could never be reached.) */
            for (j=0; (j < devices_poll->active_poll_size && !fd_found); j++) {
                if (devices_poll->async_pollfd[j].fd == fd) {
                    /* overwrite the removed slot with the last active entry */
                    devices_poll->async_pollfd[j].fd =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].fd;
                    devices_poll->async_pollfd[j].events =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].events;
                    devices_poll->async_pollfd[j].revents =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].revents;
                    fd_found = true;
                }
            }
            if (!fd_found) {
                BTL_ERROR(("Requested FD[%d] was not found in poll array",fd));
                return OPAL_ERROR;
            }
        }
        devices_poll->active_poll_size--;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_IGNORE_QP_ERR == cmd.a_cmd) {
        mca_btl_openib_qp_list *new_qp;
        new_qp = OBJ_NEW(mca_btl_openib_qp_list);
        if (NULL == new_qp) {
            /* do not dereference a failed allocation */
            BTL_ERROR(("Failed to allocate QP list item: %s:%d",
                        __FILE__, __LINE__));
            return OPAL_ERROR;
        }
        BTL_VERBOSE(("Ignore errors on QP %p", (void *)cmd.qp));
        new_qp->qp = cmd.qp;
        opal_list_append(ignore_qp_err_list, (opal_list_item_t *)new_qp);
        /* NOTE(review): the completion status is not checked here, unlike in
         * the add/remove branches -- kept as in the original. */
        (void) send_command_comp(OPENIB_ASYNC_IGNORE_QP_ERR);

    } else if (OPENIB_ASYNC_THREAD_EXIT == cmd.a_cmd) {
        /* Got 0 - command to close the thread */
        opal_list_item_t *item;
        BTL_VERBOSE(("Async event thread exit"));
        free(devices_poll->async_pollfd);
        return_status = OPAL_SUCCESS;

        while ((item = opal_list_remove_first(ignore_qp_err_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(ignore_qp_err_list);

        pthread_exit(&return_status);
    }
    return OPAL_SUCCESS;
}
/* Example #13 */
/*
 * Progress one completed datagram on "device".
 *
 * When mca_btl_ugni_get_datagram() does not return 1, its return value is
 * passed straight back.  Otherwise the endpoint the datagram belongs to is
 * determined (through a proc lookup for the wildcard endpoint), connection
 * setup is progressed under the endpoint lock and, once the endpoint is
 * connected, its smsg mailbox is processed.  A wildcard datagram is reposted
 * before returning.
 *
 * Returns the value of mca_btl_ugni_smsg_process() when it was called,
 * 0 otherwise.
 */
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
{
    mca_btl_ugni_module_t *module = mca_btl_ugni_component.modules;
    mca_btl_base_endpoint_t *endpoint;
    gni_ep_handle_t dg_handle;
    int processed = 0;
    int status;

    status = mca_btl_ugni_get_datagram (module, device, &dg_handle, &endpoint);
    if (1 != status) {
        return status;
    }

    BTL_VERBOSE(("remote datagram completion on handle %p", (void*)dg_handle));

    if (dg_handle == module->wildcard_ep) {
        /* wildcard completion: the peer is identified by the proc name
         * stored in the wildcard remote attributes */
        struct opal_proc_t *peer = opal_proc_for_name (module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
                     OPAL_NAME_PRINT(module->wc_remote_attr.proc_name)));

        endpoint = mca_btl_ugni_get_ep (&module->super, peer);
        if (OPAL_UNLIKELY(NULL == endpoint)) {
            /* nothing sensible can be done without an endpoint: abort() */
            BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
                       OPAL_NAME_PRINT(module->wc_remote_attr.proc_name)));
            abort ();
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* an endpoint must be known at this point */
    assert (NULL != endpoint);

    BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d", (void *) endpoint, dg_handle == module->wildcard_ep));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&endpoint->lock);

    if (dg_handle != module->wildcard_ep) {
        /* a directed datagram finished: clear the posted flag and drop the
         * active datagram count */
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) endpoint));

        endpoint->dg_posted = false;
        (void) opal_atomic_add_32 (&module->active_datagrams, -1);
    }

    (void) mca_btl_ugni_ep_connect_progress (endpoint);

    opal_mutex_unlock (&endpoint->lock);

    /* drain the smsg mailbox once the endpoint is connected */
    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == endpoint->state) {
        processed = mca_btl_ugni_smsg_process (endpoint);
    }

    /* the wildcard datagram was consumed: post it again */
    if (dg_handle == module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (module);
    }

    return processed;
}
/* Example #14 */
/*
 * Receive callback for an endpoint socket.
 *
 * sd    - descriptor the event fired on
 * flags - event flags (not used in this function)
 * user  - the mca_btl_base_endpoint_t the event was registered for
 *
 * The endpoint state is examined under endpoint_recv_lock; every path
 * through the switch releases that lock exactly once before leaving.
 */
static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
{
    mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t *)user;

    /* Make sure we don't have a race between a thread that remove the
     * recv event, and one event already scheduled.
     */
    if( sd != btl_endpoint->endpoint_sd )
        return;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    switch(btl_endpoint->endpoint_state) {
    case MCA_BTL_TCP_CONNECT_ACK:
        {
            /* waiting for the peer's connect ack: try to receive it */
            int rc = OMPI_ERROR;
            rc = mca_btl_tcp_endpoint_recv_connect_ack(btl_endpoint);
            if( OMPI_SUCCESS == rc ) {
                /* we are now connected. Start sending the data */
                OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
                mca_btl_tcp_endpoint_connected(btl_endpoint);
                OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
                mca_btl_tcp_endpoint_dump(btl_endpoint, "connected");
#endif
            }
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return;
        }
    case MCA_BTL_TCP_CONNECTED:
        {
            mca_btl_tcp_frag_t* frag;

            /* continue a partially received fragment if there is one,
             * otherwise allocate a fresh one */
            frag = btl_endpoint->endpoint_recv_frag;
            if(NULL == frag) {
                int rc;
                /* pick the "max" fragment list when the max send size exceeds
                 * the eager limit, the "eager" list otherwise */
                if(mca_btl_tcp_module.super.btl_max_send_size > 
                   mca_btl_tcp_module.super.btl_eager_limit) { 
                    MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc);
                } else { 
                    MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag, rc);
                }
                
                /* no fragment available: give up on this event */
                if(NULL == frag) {
                    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
                    return;
                }
                MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
            }

#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
        data_still_pending_on_endpoint:
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            /* check for completion of non-blocking recv on the current fragment */
            if(mca_btl_tcp_frag_recv(frag, btl_endpoint->endpoint_sd) == false) {
                /* incomplete: keep the fragment for the next event */
                btl_endpoint->endpoint_recv_frag = frag;
            } else {
                btl_endpoint->endpoint_recv_frag = NULL;
                if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) {
                    /* invoke the callback registered for the header's tag */
                    mca_btl_active_message_callback_t* reg;
                    reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag;
                    reg->cbfunc(&frag->btl->super, frag->hdr.base.tag, &frag->base, reg->cbdata);
                }
#if MCA_BTL_TCP_ENDPOINT_CACHE
                if( 0 != btl_endpoint->endpoint_cache_length ) {
                    /* If the cache still contain some data we can reuse the same fragment
                     * until we flush it completly.
                     */
                    MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
                    goto data_still_pending_on_endpoint;
                }
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
                MCA_BTL_TCP_FRAG_RETURN(frag);
            }
#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            break;
        }
    case MCA_BTL_TCP_CLOSED:
        /* This is a thread-safety issue. As multiple threads are allowed
         * to generate events (in the lib event) we endup with several
         * threads executing the receive callback, when we reach the end
         * of the MPI_Finalize. The first one will close the connections,
         * and all others will complain.
         */
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        break;
    default:
        /* unexpected state: release the lock first, then report and close */
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        break;
    }
}
/*
 * called when the connect module has completed setup of an endpoint
 *
 * Marks the endpoint as connected, starts the connection of endpoints that
 * were queued on the same ib_addr (XRC only), posts the fragments queued in
 * pending_lazy_frags and finally restarts pending put/get operations.
 *
 * NOTE(review): endpoint->endpoint_lock is released below but never taken in
 * this function -- presumably the caller holds it on entry; confirm.
 */
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item, *ep_item;
    mca_btl_openib_send_frag_t *frag;
    mca_btl_openib_endpoint_t *ep;
    bool master = false;

    opal_output(-1, "Now we are CONNECTED");
    if (MCA_BTL_XRC_ENABLED) {
        /* addr_lock stays held until the pending_ep list has been handled
         * further down */
        OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
        if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
            /* We are not xrc master */
            /* set our qp pointer to master qp */
            master = false;
        } else {
            /* I'm master of XRC */
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
            master = true;
        }
    }

    /* Run over all qps and load alternative path */
#if OPAL_HAVE_THREADS
    if (APM_ENABLED) {
        int i;
        if (MCA_BTL_XRC_ENABLED) {
            /* with XRC only the master loads the path, on the ib_addr qp */
            if (master) {
                mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint);
            }
        } else {
            for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
                mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint);
            }
        }
    }
#endif

    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    /* The connection is correctly setup. Now we can decrease the
       event trigger. */
    opal_progress_event_users_decrement();

    if(MCA_BTL_XRC_ENABLED) {
        /* the master starts the connection of every endpoint queued on
         * this ib_addr; a failure is only reported */
        while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
            ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
            ep = (mca_btl_openib_endpoint_t *)ep_item;
            if (OMPI_SUCCESS !=
                ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc,
                                                   ep)) {
                BTL_ERROR(("Failed to connect pending endpoint\n"));
            }
        }
        /* pairs with the lock taken at the top of the function */
        OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
    }


    /* Process pending packet on the endpoint */

    /* While there are frags in the list, process them */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */

        /* a failed post is reported; the loop goes on with the next frag */
        if (OMPI_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
		     BTL_ERROR(("Error posting send"));
		}
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* if upper layer called put or get before connection moved to connected
     * state then we restart them here */
    mca_btl_openib_frag_progress_pending_put_get(endpoint,
            mca_btl_openib_component.rdma_qp);
}
/* Example #16 */
/*
 * Send the prepared request (swr) on the cache's QP and wait for a matching
 * response in resp_mad.
 *
 * On a match the service level extracted from the response payload is stored
 * in cache->sl_values[rem_lid] and 0 is returned.  The request is sent again
 * with a new transaction id whenever no acceptable response arrived within
 * GET_SL_REC_RETRIES_TIMEOUT_MS; OPAL_ERROR is returned after more than
 * MAX_GET_SL_REC_RETRIES retries or when posting / polling fails.
 */
static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache,
                             ib_sa_mad_t *req_mad,
                             ib_sa_mad_t *resp_mad,
                             struct ibv_send_wr *swr,
                             uint16_t lid,
                             uint16_t rem_lid)
{
    struct ibv_send_wr *bad_swr;
    struct ibv_wc wc;
    struct timeval sent_at, polled_at;
    struct ibv_recv_wr *bad_rwr;
    int have_sl, retries, rc, ne, elapsed;
    ib_path_rec_t *req_rec = ib_sa_mad_get_payload_ptr(req_mad);
    ib_path_rec_t *resp_rec = ib_sa_mad_get_payload_ptr(resp_mad);

    have_sl = 0;
    retries = 0;

    /* a receive must be posted before the first request goes out */
    rc = ibv_post_recv(cache->qp, &(cache->rwr), &bad_rwr);
    if (0 != rc) {
        BTL_ERROR(("error posting receive on QP [0x%x] rc says: %s [%d]",
                   cache->qp->qp_num, strerror(rc), rc));
        return OPAL_ERROR;
    }

    do {
        /* (re)transmit the request and remember when it was sent */
        rc = ibv_post_send(cache->qp, swr, &bad_swr);
        if (0 != rc) {
            BTL_ERROR(("error posting send on QP [0x%x] rc says: %s [%d]",
                       cache->qp->qp_num, strerror(rc), rc));
            return OPAL_ERROR;
        }
        gettimeofday(&sent_at, NULL);

        /* poll until a matching response arrives or the request times out */
        for (;;) {
            ne = ibv_poll_cq(cache->cq, 1, &wc);

            if (ne < 0) {
                BTL_ERROR(("error polling CQ returned %d\n", ne));
                return OPAL_ERROR;
            }

            if (0 == ne) {
                /* nothing completed: check how long we have been waiting */
                gettimeofday(&polled_at, NULL);
                elapsed = polled_at.tv_sec - sent_at.tv_sec;
                elapsed = (elapsed * 1000000) +
                    polled_at.tv_usec - sent_at.tv_usec;
                if (elapsed > GET_SL_REC_RETRIES_TIMEOUT_MS) {
                    retries++;
                    BTL_VERBOSE(("[%d/%d] retries to get PathRecord",
                            retries, MAX_GET_SL_REC_RETRIES));
                    if (retries > MAX_GET_SL_REC_RETRIES) {
                        BTL_ERROR(("No response from SA after %d retries",
                                MAX_GET_SL_REC_RETRIES));
                        return OPAL_ERROR;
                    }
                    /* A retransmission needs a new transaction id so the */
                    /* SM does not treat it as the same request.          */
                    req_mad->trans_id += hton64(1);
                    break;
                }
                usleep(100);  /* short pause before the next poll */
                continue;
            }

            /* Only receive completions matter here; a failed send is */
            /* covered by the retransmission logic above.              */
            if (IBV_WC_RECV != wc.opcode) {
                continue;
            }

            if (0 == resp_mad->status &&
                req_rec->slid == htons(lid) &&
                req_rec->dlid == htons(rem_lid) &&
                IBV_WC_SUCCESS == wc.status &&
                wc.byte_len >= MAD_BLOCK_SIZE &&
                resp_mad->trans_id == req_mad->trans_id) {
                /* the response belongs to our request: record the SL */
                cache->sl_values[rem_lid] = ib_path_rec_sl(resp_rec);
                have_sl = 1;
                break;
            }

            /* Unusable response (bad status or mismatch): ignore it and */
            /* let the timeout trigger a retry; the receive WR has to be */
            /* posted again in the meantime.                             */
            rc = ibv_post_recv(cache->qp, &(cache->rwr), &bad_rwr);
            if (0 != rc) {
                BTL_ERROR(("error posing receive on QP[%x] rc says: %s [%d]",
                           cache->qp->qp_num, strerror(rc), rc));
                return OPAL_ERROR;
            }
        }
    } while (0 == have_sl);

    return 0;
}
/*
 * Send a credit-update control message for queue pair "qp" of "endpoint".
 *
 * The per-QP credit fragment is allocated and initialised on first use and
 * reused afterwards.  The message goes over eager RDMA when a send credit
 * can be acquired for it, otherwise over the regular send path, which is
 * limited by the reserved credit-message slots (rd_rsv).
 *
 * Every exit except a successful post_send() calls
 * BTL_OPENIB_CREDITS_SEND_UNLOCK and, where credits were already taken out
 * of the counters, puts them back.
 * NOTE(review): the matching lock is presumably taken by the caller and,
 * on success, released when the send completes -- confirm.
 */
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
        const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        if(OPAL_UNLIKELY(NULL == frag)) {
            /* Allocation failed: nothing has been consumed yet, so leave the
             * same way as the "no reserved slot" path below instead of
             * dereferencing a NULL fragment. */
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            BTL_ERROR(("failed to allocate a control fragment for credits"));
            return;
        }
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_BTL;
        to_base_frag(frag)->segment.base.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.base.seg_addr.pval;
    if(OMPI_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        /* regular send path: take one of the reserved credit-message slots,
         * or give up when all of them are in use */
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
     }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        /* cm_seen carries at most 255: report that much and put the
         * remainder back for a later message */
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
         BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    /* The post failed: convert the headers back to host order so the credit
     * values read below are usable, then return everything that was taken. */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
            frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s", rc,
                strerror(errno)));
}
/* Example #18 */
static int init_device(struct ibv_context *context_arg,
                       struct mca_btl_openib_sa_qp_cache *cache,
                       uint32_t port_num)
{
    struct ibv_ah_attr aattr;
    struct ibv_port_attr pattr;
    int rc;

    cache->context = ibv_open_device(context_arg->device);
    if (NULL == cache->context) {
        BTL_ERROR(("error obtaining device context for %s errno says %s",
                    ibv_get_device_name(context_arg->device), strerror(errno)));
        return OPAL_ERROR;
    }
    cache->device_name = strdup(ibv_get_device_name(cache->context->device));
    cache->port_num = port_num;

    /* init all sl_values to be SL_NOT_PRESENT */
    memset(&cache->sl_values, SL_NOT_PRESENT, sizeof(cache->sl_values));

    cache->next = sa_qp_cache;
    sa_qp_cache = cache;

    /* allocate the protection domain for the device */
    cache->pd = ibv_alloc_pd(cache->context);
    if (NULL == cache->pd) {
        BTL_ERROR(("error allocating protection domain for %s errno says %s",
                    ibv_get_device_name(context_arg->device), strerror(errno)));
        return OPAL_ERROR;
    }

    /* register memory region */
    cache->mr = ibv_reg_mr(cache->pd, cache->send_recv_buffer,
            sizeof(cache->send_recv_buffer),
            IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
    if (NULL == cache->mr) {
        BTL_ERROR(("error registering memory region, errno says %s", strerror(errno)));
        return OPAL_ERROR;
    }

    /* init the ud qp */
    rc = init_ud_qp(context_arg, cache);
    if (OPAL_ERROR == rc) {
        return OPAL_ERROR;
    }

    rc = ibv_query_port(cache->context, cache->port_num, &pattr);
    if (rc) {
        BTL_ERROR(("error getting port attributes for device %s "
                    "port number %d errno says %s",
                    ibv_get_device_name(context_arg->device),
                    cache->port_num, strerror(errno)));
        return OPAL_ERROR;
    }

    /* create address handle  */
    memset(&aattr, 0, sizeof(aattr));
    aattr.dlid = pattr.sm_lid;
    aattr.sl = pattr.sm_sl;
    aattr.port_num = cache->port_num;
    cache->ah = ibv_create_ah(cache->pd, &aattr);
    if (NULL == cache->ah) {
        BTL_ERROR(("error creating address handle: %s", strerror(errno)));
        return OPAL_ERROR;
    }

    memset(&(cache->rwr), 0, sizeof(cache->rwr));
    cache->rwr.num_sge = 1;
    cache->rwr.sg_list = &(cache->rsge);
    memset(&(cache->rsge), 0, sizeof(cache->rsge));
    cache->rsge.addr = (uint64_t)(void *)
        (cache->send_recv_buffer + MAD_BLOCK_SIZE);
    cache->rsge.length = MAD_BLOCK_SIZE + 40;
    cache->rsge.lkey = cache->mr->lkey;

    return 0;
}
/**
 * Look up, or create and register, the TCP BTL proc object for an OMPI proc.
 *
 * The proc is keyed in mca_btl_tcp2_component.tcp_procs by the hash of its
 * process name. If an entry already exists it is returned as is. Otherwise
 * a new object is created, inserted into the table, and populated with the
 * addresses the peer exported through the modex; an endpoint pointer array
 * with one slot per address plus one spare is allocated.
 *
 * @param ompi_proc  process to look up / create the BTL proc for
 * @return the BTL proc, or NULL on allocation failure, modex failure, or a
 *         modex payload whose size is not a multiple of
 *         sizeof(mca_btl_tcp2_addr_t)
 */
mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc)
{
    int rc;
    size_t size;
    mca_btl_tcp2_proc_t* btl_proc;
    uint64_t hash = orte_util_hash_name(&ompi_proc->proc_name);

    OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock);
    rc = opal_hash_table_get_value_uint64(&mca_btl_tcp2_component.tcp_procs,
                                          hash, (void**)&btl_proc);
    if(OMPI_SUCCESS == rc) {
        OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);
        return btl_proc;
    }

    btl_proc = OBJ_NEW(mca_btl_tcp2_proc_t);
    if(NULL == btl_proc) {
        /* Do not leave the component lock held on the failure path. */
        OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);
        return NULL;
    }
    btl_proc->proc_ompi = ompi_proc;

    /* add to hash table of all proc instance */
    opal_hash_table_set_value_uint64(&mca_btl_tcp2_component.tcp_procs,
                                     hash, btl_proc);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);

    /* lookup tcp parameters exported by this proc */
    rc = ompi_modex_recv( &mca_btl_tcp2_component.super.btl_version,
                                  ompi_proc,
                                  (void**)&btl_proc->proc_addrs,
                                  &size );
    if(rc != OMPI_SUCCESS) {
        BTL_ERROR(("mca_base_modex_recv: failed with return value=%d", rc));
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(0 != (size % sizeof(mca_btl_tcp2_addr_t))) {
        BTL_ERROR(("mca_base_modex_recv: invalid size %lu: btl-size: %lu\n",
          (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp2_addr_t)));
        /* Drop the half-built proc exactly like the other failure paths
         * instead of leaking it. */
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp2_addr_t);

    /* allocate space for endpoint array - one for each exported address */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc((1 + btl_proc->proc_addr_count) *
                sizeof(mca_btl_base_endpoint_t*));
    if(NULL == btl_proc->proc_endpoints) {
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    /* Remember the proc object that describes this very process. */
    if(NULL == mca_btl_tcp2_component.tcp_local && ompi_proc == ompi_proc_local()) {
        mca_btl_tcp2_component.tcp_local = btl_proc;
    }
    {
        /* convert the OMPI addr_family field to OS constants,
         * so we can check for AF_INET (or AF_INET6) and don't have
         * to deal with byte ordering anymore.
         */
        unsigned int i;
        for (i = 0; i < btl_proc->proc_addr_count; i++) {
            if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET;
            }
#if OPAL_WANT_IPV6
            if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET6;
            }
#endif
        }
    }
    return btl_proc;
}
Exemple #20
0
/**
 * Open the XRC domain of a device.
 *
 * Builds the path "<job_session_dir>/openib_xrc_domain_<device name>",
 * opens (creating if needed) that file, stores the descriptor in
 * device->xrc_fd and opens the XRC domain on top of it. The domain handle
 * is stored in device->xrcd or device->xrc_domain depending on
 * OPAL_HAVE_CONNECTX_XRC_DOMAINS.
 *
 * @param device  device to open the domain for
 * @return OPAL_SUCCESS, or OPAL_ERROR if the path cannot be built, the file
 *         cannot be opened, or the domain cannot be opened (the file
 *         descriptor is closed again in the last case)
 */
int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device)
{
    int len;
    char *xrc_file_name;
    const char *dev_name;
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    struct ibv_xrcd_init_attr xrcd_attr;
#endif

    dev_name = ibv_get_device_name(device->ib_dev);
    len = asprintf(&xrc_file_name,
                   "%s"OPAL_PATH_SEP"openib_xrc_domain_%s",
                   opal_process_info.job_session_dir, dev_name);
    if (0 > len) {
        BTL_ERROR(("Failed to allocate memomry for XRC file name: %s\n",
                   strerror(errno)));
        return OPAL_ERROR;
    }

    device->xrc_fd = open(xrc_file_name, O_CREAT, S_IWUSR|S_IRUSR);
    if (0 > device->xrc_fd) {
        BTL_ERROR(("Failed to open XRC domain file %s, errno says %s\n",
                   xrc_file_name,strerror(errno)));
        free(xrc_file_name);
        return OPAL_ERROR;
    }
    /* The path was only needed for open(); release it here so that neither
     * the success path nor the failure path below leaks it. */
    free(xrc_file_name);

#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    memset(&xrcd_attr, 0, sizeof xrcd_attr);
    xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
    xrcd_attr.fd = device->xrc_fd;
    xrcd_attr.oflags = O_CREAT;
    device->xrcd = ibv_open_xrcd(device->ib_dev_context, &xrcd_attr);
    if (NULL == device->xrcd) {
#else
    device->xrc_domain = ibv_open_xrc_domain(device->ib_dev_context, device->xrc_fd, O_CREAT);
    if (NULL == device->xrc_domain) {
#endif
        BTL_ERROR(("Failed to open XRC domain\n"));
        close(device->xrc_fd);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}

/**
 * Close the XRC domain of a device and the file descriptor backing it.
 *
 * Does nothing (and succeeds) when the device has no XRC domain handle.
 *
 * @param device  device whose domain is closed
 * @return OPAL_SUCCESS, or OPAL_ERROR if closing the domain or the file
 *         descriptor fails (the descriptor is not closed when closing the
 *         domain fails)
 */
int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device)
{
    int saved_errno;

#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    if (NULL == device->xrcd) {
#else
    if (NULL == device->xrc_domain) {
#endif
        /* No XRC domain, just exit */
        return OPAL_SUCCESS;
    }
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
    if (ibv_close_xrcd(device->xrcd)) {
#else
    if (ibv_close_xrc_domain(device->xrc_domain)) {
#endif
        /* Report errno itself: the message says "errno %d" but used to be
         * handed the file descriptor. */
        saved_errno = errno;
        BTL_ERROR(("Failed to close XRC domain, errno %d says %s\n",
                   saved_errno, strerror(saved_errno)));
        return OPAL_ERROR;
    }
    /* do we need to check exit status */
    if (close(device->xrc_fd)) {
        saved_errno = errno;
        BTL_ERROR(("Failed to close XRC file descriptor, errno %d says %s\n",
                   saved_errno, strerror(saved_errno)));
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}

/* Class constructor: start with no key, no QP, zeroed addressing fields and
 * the CLOSED status, then construct the embedded lock and pending list. */
static void ib_address_constructor(ib_address_t *ib_addr)
{
    /* plain fields */
    ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
    ib_addr->lid = 0;
    ib_addr->subnet_id = 0;
    ib_addr->qp = NULL;
    ib_addr->key = NULL;

    /* embedded OPAL objects */
    OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t);
}

/* Class destructor: free the hash key buffer (if any) and destruct the
 * embedded lock and pending-endpoint list. */
static void ib_address_destructor(ib_address_t *ib_addr)
{
    /* free() accepts NULL, so a key that was never allocated is fine */
    free(ib_addr->key);

    OBJ_DESTRUCT(&ib_addr->addr_lock);
    OBJ_DESTRUCT(&ib_addr->pending_ep);
}

/*
 * Allocate and fill the hash key of an ib_address_t and cache the LID and
 * subnet id in the object.
 *
 * Key layout (SIZE_OF3(s_id, lid, ep_jobid) bytes, zero-filled first):
 *   lid | s_id | ep_jobid
 *
 * Returns OPAL_SUCCESS, or OPAL_ERROR if the key cannot be allocated.
 */
static int ib_address_init(ib_address_t *ib_addr, uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid)
{
    char *cursor;

    /* zero-filled allocation of the key buffer */
    ib_addr->key = calloc(1, SIZE_OF3(s_id, lid, ep_jobid));
    if (NULL == ib_addr->key) {
        BTL_ERROR(("Failed to allocate memory for key\n"));
        return OPAL_ERROR;
    }

    /* pack the three components back to back */
    cursor = (char *) ib_addr->key;
    memcpy(cursor, &lid, sizeof(lid));
    cursor += sizeof(lid);
    memcpy(cursor, &s_id, sizeof(s_id));
    cursor += sizeof(s_id);
    memcpy(cursor, &ep_jobid, sizeof(ep_jobid));

    /* keep the lid and subnet id directly accessible */
    ib_addr->lid = lid;
    ib_addr->subnet_id = s_id;

    return OPAL_SUCCESS;
}

/*
 * Attach the ib_address_t for (lid, subnet id, job id) to an endpoint.
 *
 * A candidate entry is built first; then, with
 * mca_btl_openib_component.ib_lock held, the address table is searched for
 * its key. If the key is absent the candidate is inserted and becomes
 * ep->ib_addr; if it is already present the existing entry is used and the
 * candidate is released.
 *
 * The function takes ib_lock itself.
 * NOTE(review): callers presumably must not already hold it -- confirm.
 *
 * Returns OPAL_SUCCESS, OPAL_ERROR when the candidate cannot be allocated
 * or initialised, or the error returned by the hash-table insert.
 */
int mca_btl_openib_ib_address_add_new (uint16_t lid, uint64_t s_id,
                                       opal_jobid_t ep_jobid, mca_btl_openib_endpoint_t *ep)
{
    void *tmp;
    int ret = OPAL_SUCCESS;
    struct ib_address_t *ib_addr = OBJ_NEW(ib_address_t);

    if (NULL == ib_addr) {
        BTL_ERROR(("XRC Internal error. Failed to allocate ib_addr\n"));
        return OPAL_ERROR;
    }

    ret = ib_address_init(ib_addr, lid, s_id, ep_jobid);
    if (OPAL_SUCCESS != ret ) {
        BTL_ERROR(("XRC Internal error. Failed to init ib_addr\n"));
        /* The object came from OBJ_NEW, so it has to be released;
         * OBJ_DESTRUCT would run the destructor but leak the object. */
        OBJ_RELEASE(ib_addr);
        return ret;
    }
    /* is it already in the table ?*/
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
    if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(&mca_btl_openib_component.ib_addr_table,
            ib_addr->key,
            SIZE_OF3(s_id, lid, ep_jobid), &tmp)) {
        /* It is new one, lets put it on the table */
        ret = opal_hash_table_set_value_ptr(&mca_btl_openib_component.ib_addr_table,
                                            ib_addr->key, SIZE_OF3(s_id, lid, ep_jobid), (void*)ib_addr);
        if (OPAL_SUCCESS != ret) {
            BTL_ERROR(("XRC Internal error."
                       " Failed to add element to mca_btl_openib_component.ib_addr_table\n"));
            OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
            OBJ_RELEASE(ib_addr);
            return ret;
        }
        /* update the endpoint with pointer to ib address */
        ep->ib_addr = ib_addr;
    } else {
        /* so we have this one in the table, just add the pointer to the endpoint */
        ep->ib_addr = (ib_address_t *)tmp;
        assert(lid == ep->ib_addr->lid && s_id == ep->ib_addr->subnet_id);
        /* the candidate is not needed; free it completely */
        OBJ_RELEASE(ib_addr);
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);

    return ret;
}
Exemple #21
0
/**
 * Tell the remote side that an endpoint is broken so it can bring its own
 * endpoint down as well.  This is needed because sometimes only one side of
 * a connection notices the problem.
 *
 * The message is sent over a different openib BTL than the one that failed:
 * the first other BTL in the component list is picked, and the endpoint on
 * that BTL's device whose process matches the remote process is used.  If
 * no other BTL, no matching endpoint, or no control fragment is available
 * the function silently gives up (verbose output only).
 *
 * @param endpoint Pointer to endpoint with error
 * @param type Control message type written into the header
 * @param index Written into the header; non zero when sending an RDMA
 *              error message
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* failed_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* alt_btl = NULL;
    mca_btl_base_endpoint_t* alt_ep = NULL;
    mca_btl_base_endpoint_t* candidate;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;
    int idx, rc;

    /* Step 1: pick the first BTL that is not the failing one. */
    for (idx = 0; idx < mca_btl_openib_component.ib_num_btls; idx++) {
        if (failed_btl != mca_btl_openib_component.openib_btls[idx]) {
            alt_btl = mca_btl_openib_component.openib_btls[idx];
            break;
        }
    }
    if (NULL == alt_btl) {
        /* Nothing else to send over. */
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        return;
    }

    /* Step 2: walk the endpoints of that BTL's device and look for the one
     * that talks to the same remote process. */
    for (idx = 0; idx < opal_pointer_array_get_size(alt_btl->device->endpoints); idx++) {
        candidate = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(alt_btl->device->endpoints, idx);
        if (NULL != candidate &&
            candidate->endpoint_proc->proc_opal == remote_proc) {
            alt_ep = candidate;
            break;
        }
    }
    if (NULL == alt_ep) {
        /* No usable endpoint on the alternate BTL. */
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        return;
    }

    /* Step 3: grab a control fragment to carry the message. */
    frag = alloc_control_frag(alt_btl);
    if (NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        return;
    }

    /* Descriptor setup: priority send, callback always invoked. */
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = alt_ep;

    /* Payload: identify the broken connection by the failing BTL's port. */
    frag->hdr->tag = MCA_BTL_TAG_IB;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = OPAL_PROC_MY_NAME.vpid;
    bc_hdr->index = index;

    /* Convert to network byte order when the sending endpoint needs it. */
    if (alt_ep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }

    rc = mca_btl_openib_endpoint_send(alt_ep, frag);
    if (OPAL_SUCCESS != rc && OPAL_ERR_RESOURCE_BUSY != rc) {
        /* The send was neither accepted nor queued: return the fragment. */
        MCA_BTL_IB_FRAG_RETURN(frag);
        BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
    }
}
Exemple #22
0
/*
 * Handle one command sent to the async event thread.
 *
 * A single int is read from devices_poll->async_pollfd[0].fd:
 *   > 0  add that file descriptor to the poll array (made non-blocking
 *        first; the array is doubled in size when full),
 *   < 0  remove the descriptor -value from the poll array by overwriting
 *        its slot with the last active slot,
 *   == 0 free the poll array and terminate the thread via pthread_exit().
 * Add and remove are confirmed with send_command_comp().
 *
 * Returns OMPI_SUCCESS or OMPI_ERROR; does not return for command 0.
 */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll)
{
    struct pollfd *async_pollfd_tmp;
    int fd,flags,j;
    /* Got command from main thread */
    if (read(devices_poll->async_pollfd[0].fd, &fd, sizeof(int)) < 0) {
        BTL_ERROR(("Read failed [%d]",errno));
        return OMPI_ERROR;
    }
    BTL_VERBOSE(("GOT event from -> %d",fd));
    if (fd > 0) {
        BTL_VERBOSE(("Adding device [%d] to async event poll[%d]",
                     fd, devices_poll->active_poll_size));
        flags = fcntl(fd, F_GETFL);
        if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            BTL_ERROR(("Failed to change file descriptor of async event"));
            return OMPI_ERROR;
        }
        if ((devices_poll->active_poll_size + 1) > devices_poll->poll_size) {
            /* Out of slots: double the array and copy the active entries. */
            devices_poll->poll_size+=devices_poll->poll_size;
            async_pollfd_tmp = malloc(sizeof(struct pollfd) * devices_poll->poll_size);
            if (NULL == async_pollfd_tmp) {
                BTL_ERROR(("Failed malloc: %s:%d.  "
                            "Fatal error, stoping asynch event thread"
                            , __FILE__, __LINE__));
                return OMPI_ERROR;
            }
            memcpy (async_pollfd_tmp,devices_poll->async_pollfd,
                    sizeof(struct pollfd) * (devices_poll->active_poll_size));
            free(devices_poll->async_pollfd);
            devices_poll->async_pollfd = async_pollfd_tmp;
        }
        devices_poll->async_pollfd[devices_poll->active_poll_size].fd = fd;
        devices_poll->async_pollfd[devices_poll->active_poll_size].events = POLLIN;
        devices_poll->async_pollfd[devices_poll->active_poll_size].revents = 0;
        devices_poll->active_poll_size++;
        if (OMPI_SUCCESS != send_command_comp(fd)) {
            return OMPI_ERROR;
        }
    } else if (fd < 0) {
        bool fd_found = false;
        /* Removing device from poll */
        fd = -(fd);
        BTL_VERBOSE(("Removing device [%d] from async event poll [%d]",
                     fd, devices_poll->active_poll_size));
        if (devices_poll->active_poll_size > 1) {
            /* Stop at the first match AND never walk past the active
             * entries.  The condition used to be "||", which kept scanning
             * beyond the array when the fd was absent and made the
             * not-found check below unreachable. */
            for (j=0; (j < devices_poll->active_poll_size && !fd_found); j++) {
                if (devices_poll->async_pollfd[j].fd == fd) {
                    /* Fill the hole with the last active entry. */
                    devices_poll->async_pollfd[j].fd =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].fd;
                    devices_poll->async_pollfd[j].events =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].events;
                    devices_poll->async_pollfd[j].revents =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].revents;
                    fd_found = true;
                }
            }
            if (!fd_found) {
                BTL_ERROR(("Requested FD[%d] was not found in poll array",fd));
                return OMPI_ERROR;
            }
        }
        devices_poll->active_poll_size--;
        if (OMPI_SUCCESS != send_command_comp(-(fd))) {
            return OMPI_ERROR;
        }
    } else {
        /* Got 0 - command to close the thread */
        BTL_VERBOSE(("Async event thread exit"));
        free(devices_poll->async_pollfd);
        return_status = OMPI_SUCCESS;
        pthread_exit(&return_status);
    }
    return OMPI_SUCCESS;
}
Exemple #23
0
/*
 * Endpoint class constructor: put every field of a new endpoint into its
 * initial state, construct the embedded OPAL objects and allocate the
 * fragment reassembly table.
 */
static void endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    int chan;

    /* Identity / bookkeeping: nothing attached yet. */
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_proc_index = -1;
    endpoint->endpoint_module = NULL;
    endpoint->endpoint_connectivity_checked = false;
    endpoint->endpoint_exiting = false;

    /* Remote address: no QP numbers, zero GID, no address handle. */
    for (chan = 0; chan < USNIC_NUM_CHANNELS; ++chan) {
        endpoint->endpoint_remote_addr.qp_num[chan] = 0;
    }
    endpoint->endpoint_remote_addr.gid.global.interface_id = 0;
    endpoint->endpoint_remote_addr.gid.global.subnet_prefix = 0;
    endpoint->endpoint_remote_ah = NULL;

    /* Sender state: initial credits, first fragment id, not yet ready to
     * send and no ACK owed. */
    endpoint->endpoint_send_credits = 8;
    endpoint->endpoint_next_frag_id = 1;
    endpoint->endpoint_acktime = 0;
    endpoint->endpoint_ready_to_send = 0;
    endpoint->endpoint_ack_needed = false;

    /* Queue of fragments waiting to be sent. */
    OBJ_CONSTRUCT(&endpoint->endpoint_frag_send_queue, opal_list_t);

    /* Sequence-number tracking arrays start out all zero. */
    memset(endpoint->endpoint_rcvd_segs, 0,
           sizeof(endpoint->endpoint_rcvd_segs));
    memset(endpoint->endpoint_sent_segs, 0,
           sizeof(endpoint->endpoint_sent_segs));

    /*
     * The "hotel" triggers segment retransmission on timeout: WINDOW_SIZE
     * rooms, the component's retransmit timeout, and
     * ompi_btl_usnic_ack_timeout as the eviction callback.
     */
    OBJ_CONSTRUCT(&endpoint->endpoint_hotel, opal_hotel_t);
    opal_hotel_init(&endpoint->endpoint_hotel,
                    WINDOW_SIZE,
                    mca_btl_usnic_component.retrans_timeout,
                    0,
                    ompi_btl_usnic_ack_timeout);

    /* List links used to put this endpoint on other lists. */
    OBJ_CONSTRUCT(&(endpoint->endpoint_ack_li), opal_list_item_t);
    OBJ_CONSTRUCT(&(endpoint->endpoint_endpoint_li), opal_list_item_t);

    /* Zeroed table of MAX_ACTIVE_FRAGS reassembly records. */
    endpoint->endpoint_rx_frag_info =
        calloc(MAX_ACTIVE_FRAGS, sizeof(struct ompi_btl_usnic_rx_frag_info_t));
    assert(NULL != endpoint->endpoint_rx_frag_info);
    if (OPAL_UNLIKELY(NULL == endpoint->endpoint_rx_frag_info)) {
        BTL_ERROR(("calloc returned NULL -- this should not happen!"));
        ompi_btl_usnic_exit();
        /* Does not return */
    }
}
Exemple #24
0
/* Function handle async device events */
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll, int index)
{
    int j;
    mca_btl_openib_device_t *device = NULL;
    struct ibv_async_event event;
    bool xrc_event = false;
    int event_type;

    /* We need to find correct device and process this event */
    for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
        if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
                devices_poll->async_pollfd[index].fd ) {
            device = mca_btl_openib_component.openib_btls[j]->device;
            break;
        }
    }
    if (NULL != device) {
        if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context,&event) < 0) {
            if (EWOULDBLOCK == errno) {
                /* No event found ?
                 * It was handled by somebody other */
                return OMPI_SUCCESS;
            } else {
                BTL_ERROR(("Failed to get async event"));
                return OMPI_ERROR;
            }
        }

        event_type = event.event_type;
#if HAVE_XRC
        /* is it XRC event ?*/
        if (IBV_XRC_QP_EVENT_FLAG & event.event_type) {
            xrc_event = true;
            /* Clean the bitnd handel as usual */
            event_type ^= IBV_XRC_QP_EVENT_FLAG;
        }
#endif
        switch(event_type) {
            case IBV_EVENT_PATH_MIG:
                BTL_ERROR(("Alternative path migration event reported"));
                if (APM_ENABLED) {
                    BTL_ERROR(("Trying to find additional path..."));
                    if (!xrc_event)
                        mca_btl_openib_load_apm(event.element.qp,
                                qp2endpoint(event.element.qp, device));
#if HAVE_XRC
                    else
                        mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num,
                                xrc_qp2endpoint(event.element.xrc_qp_num, device));
#endif
                }
                break;
            case IBV_EVENT_DEVICE_FATAL:
                /* Set the flag to fatal */
                device->got_fatal_event = true;
                /* It is not critical to protect the counter */
                OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            case IBV_EVENT_CQ_ERR:
            case IBV_EVENT_QP_FATAL:
            case IBV_EVENT_QP_REQ_ERR:
            case IBV_EVENT_QP_ACCESS_ERR:
            case IBV_EVENT_PATH_MIG_ERR:
            case IBV_EVENT_SRQ_ERR:
                orte_show_help("help-mpi-btl-openib.txt", "of error event",
                    true,orte_process_info.nodename, orte_process_info.pid,
                    event_type,
                    openib_event_to_str((enum ibv_event_type)event_type),
                    xrc_event ? "true" : "false");
                break;
            case IBV_EVENT_PORT_ERR:
                orte_show_help("help-mpi-btl-openib.txt", "of error event",
                    true,orte_process_info.nodename, orte_process_info.pid,
                    event_type,
                    openib_event_to_str((enum ibv_event_type)event_type),
                    xrc_event ? "true" : "false");
                /* Set the flag to indicate port error */
                device->got_port_event = true;
                OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
                break;
            case IBV_EVENT_COMM_EST:
            case IBV_EVENT_PORT_ACTIVE:
            case IBV_EVENT_SQ_DRAINED:
            case IBV_EVENT_LID_CHANGE:
            case IBV_EVENT_PKEY_CHANGE:
            case IBV_EVENT_SM_CHANGE:
            case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
            case IBV_EVENT_CLIENT_REREGISTER:
#endif
                break;
            /* The event is signaled when number of prepost receive WQEs is going
                                            under predefined threshold - srq_limit */
            case IBV_EVENT_SRQ_LIMIT_REACHED:
                if(OMPI_SUCCESS !=
                         btl_openib_async_srq_limit_event(event.element.srq)) {
                    return OMPI_ERROR;
                }

                break;
            default:
                orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
                        true,orte_process_info.nodename, orte_process_info.pid,
                        event_type, xrc_event ? "true" : "false");
        }
        ibv_ack_async_event(&event);
    } else {
        /* if (device == NULL), then failed to locate the device!
           This should never happen... */
        BTL_ERROR(("Failed to find device with FD %d.  "
                   "Fatal error, stoping asynch event thread",
                   devices_poll->async_pollfd[index].fd));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
/*
 * Poll one RDMA completion queue of the module for a single event and
 * complete, repost or fail the post descriptor it belongs to.
 *
 * which_cq == 0 selects rdma_local_cq, any other value rdma_local_irq_cq.
 * The device lock is held around GNI_CqGetEvent and GNI_GetCompleted only.
 *
 * Returns 1 when a descriptor completed, 0 when no event was pending or a
 * failed post was reposted, OPAL_ERROR when a descriptor was given up on,
 * and otherwise the OPAL translation of the uGNI return code.
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc = NULL;
    gni_post_descriptor_t *gni_desc;
    gni_cq_entry_t cq_event = 0;
    gni_cq_handle_t cq;
    gni_return_t grc;
    uint32_t recoverable = 1;

    if (0 == which_cq) {
        cq = ugni_module->rdma_local_cq;
    } else {
        cq = ugni_module->rdma_local_irq_cq;
    }

    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);

    grc = GNI_CqGetEvent (cq, &cq_event);
    if (GNI_RC_NOT_DONE == grc) {
        /* nothing pending on this queue */
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !cq_event) || GNI_CQ_OVERRUN(cq_event))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

        return opal_common_rc_ugni_to_opal (grc);
    }

    /* map the event back to the descriptor that was posted */
    grc = GNI_GetCompleted (cq, cq_event, &gni_desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return opal_common_rc_ugni_to_opal (grc);
    }

    post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(gni_desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(cq_event))) {
        /* the post failed; find out whether retrying makes sense */
        (void) GNI_CqErrorRecoverable (cq_event, &recoverable);

        if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            char err_text[1024];

            GNI_CqErrorStr (cq_event, err_text, sizeof(err_text));
            /* out of retries or unrecoverable: give up */
            BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
                       recoverable, err_text));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (post_desc);
#endif
            mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);

            return OPAL_ERROR;
        }

        /* try the same descriptor again */
        mca_btl_ugni_repost (ugni_module, post_desc);

        return 0;
    }

    /* clean completion */
    mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));

    return 1;
}
Exemple #26
0
/*
 * Emit one verbose line describing an endpoint's socket: local and peer
 * address, TCP_NODELAY, send/receive buffer sizes and the fcntl flags.
 *
 * Every lookup that fails is reported with BTL_ERROR and the corresponding
 * value keeps its default ("unknown" for addresses, -1 for buffer sizes,
 * 0 for nodelay), so the final line never prints indeterminate data.
 *
 * @param btl_endpoint  endpoint whose socket (endpoint_sd) is inspected
 * @param msg           prefix for the output line
 */
static void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint, const char* msg)
{
    char src[64] = "unknown";
    char dst[64] = "unknown";
    int sndbuf = -1, rcvbuf = -1, nodelay = 0, flags;
#if OPAL_WANT_IPV6
    struct sockaddr_storage inaddr;
#else
    struct sockaddr_in inaddr;
#endif
    opal_socklen_t obtlen;
    opal_socklen_t addrlen = sizeof(inaddr);

    /* Local address. */
    if (getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
        BTL_ERROR(("getsockname() failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
#if OPAL_WANT_IPV6
        char *address = (char *) opal_net_get_hostname((struct sockaddr*) &inaddr);
        if (NULL != address) {
            /* bounded copy: the name may be longer than the buffer */
            snprintf(src, sizeof(src), "%s", address);
        }
#else
        snprintf(src, sizeof(src), "%s", inet_ntoa(inaddr.sin_addr));
#endif
    }

    /* Peer address.  addrlen is value-result, so reset it first. */
    addrlen = sizeof(inaddr);
    if (getpeername(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen) < 0) {
        BTL_ERROR(("getpeername() failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
#if OPAL_WANT_IPV6
        char *address = (char *) opal_net_get_hostname ((struct sockaddr*) &inaddr);
        if (NULL != address) {
            snprintf(dst, sizeof(dst), "%s", address);
        }
#else
        snprintf(dst, sizeof(dst), "%s", inet_ntoa(inaddr.sin_addr));
#endif
    }

    if((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }

#if defined(SO_SNDBUF)
    obtlen = sizeof(sndbuf);
    if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_SNDBUF, (char *)&sndbuf, &obtlen) < 0) {
        BTL_ERROR(("SO_SNDBUF option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#endif
#if defined(SO_RCVBUF)
    obtlen = sizeof(rcvbuf);
    if(getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF, (char *)&rcvbuf, &obtlen) < 0) {
        BTL_ERROR(("SO_RCVBUF option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#endif
#if defined(TCP_NODELAY)
    obtlen = sizeof(nodelay);
    if(getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY, (char *)&nodelay, &obtlen) < 0) {
        BTL_ERROR(("TCP_NODELAY option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#endif

    BTL_VERBOSE(("%s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x",
        msg, src, dst, nodelay, sndbuf, rcvbuf, flags));
}