Example #1
static inline int mca_btl_ugni_ep_connect_start (mca_btl_base_endpoint_t *ep) {
    int rc;

    rc = mca_btl_ugni_ep_connect_rdma (ep);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return rc;
    }

    BTL_VERBOSE(("initiaiting connection to remote peer with address: %u id: %u proc: %p",
                 ep->common->ep_rem_addr, ep->common->ep_rem_id, (void *)ep->peer_proc));

    /* bind endpoint to remote address */
    /* we bind two endpoints to separate out local smsg completion and local fma completion */
    rc = opal_common_ugni_ep_create (ep->common, ep->btl->smsg_local_cq, &ep->smsg_ep_handle);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return rc;
    }

    /* build connection data */
    rc = mca_btl_ugni_ep_smsg_get_mbox (ep);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        return rc;
    }

    ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTING;

    memset (&ep->remote_attr, 0, sizeof (ep->remote_attr));

    BTL_VERBOSE(("btl/ugni connection to remote peer initiated"));

    return OPAL_SUCCESS;
}
Example #2
mca_btl_base_descriptor_t *
mca_btl_scif_alloc(struct mca_btl_base_module_t *btl,
                   struct mca_btl_base_endpoint_t *endpoint,
                   uint8_t order, size_t size, uint32_t flags)
{
    mca_btl_scif_base_frag_t *frag = NULL;

    BTL_VERBOSE(("allocating fragment of size: %u", (unsigned int)size));

    if (size <= mca_btl_scif_module.super.btl_eager_limit) {
        (void) MCA_BTL_SCIF_FRAG_ALLOC_EAGER(endpoint, frag);
    }

    if (OPAL_UNLIKELY(NULL == frag)) {
        return NULL;
    }

    BTL_VERBOSE(("btl/scif_module allocated frag of size: %u, flags: %x. frag = %p",
                 (unsigned int)size, flags, (void *) frag));

    frag->base.des_flags = flags;
    frag->base.order = order;
    frag->base.des_segments = frag->segments;
    frag->base.des_segment_count = 1;

    frag->segments[0].seg_len       = size;

    return &frag->base;
}
Example #3
static inline int mca_btl_scif_ep_get_buffer (mca_btl_base_endpoint_t *ep) {
    int rc;

    rc = posix_memalign ((void **) &ep->recv_buffer.buffer, opal_getpagesize(), mca_btl_scif_component.segment_size);
    /* posix_memalign returns 0 on success or an error number (it does not set errno) */
    if (0 != rc) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    memset (ep->recv_buffer.buffer, 0, mca_btl_scif_component.segment_size);

    ep->recv_buffer.scif_offset = scif_register (ep->scif_epd, ep->recv_buffer.buffer,
                                                 mca_btl_scif_component.segment_size, 0,
                                                 SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
    if (SCIF_REGISTER_FAILED == ep->recv_buffer.scif_offset) {
        BTL_VERBOSE(("failed to register a scif buffer of size %d. errno = %d",
                     mca_btl_scif_component.segment_size, errno));
        free (ep->recv_buffer.buffer);
        ep->recv_buffer.buffer = NULL;
        return OPAL_ERROR;
    }

    ep->recv_buffer.startp = (uint32_t *) ep->recv_buffer.buffer;
    ep->recv_buffer.endp   = ep->recv_buffer.startp + 1;

    ep->recv_buffer.startp[0] = ep->recv_buffer.endp[0] = 64;

    BTL_VERBOSE(("allocated buffer of size %d bytes. with scif registration %lu",
                 mca_btl_scif_component.segment_size, (unsigned long) ep->recv_buffer.scif_offset));

    return OPAL_SUCCESS;
}
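
A note on the error check above: posix_memalign returns 0 on success or an error number directly (it never returns a negative value and does not set errno), so the result must be compared against 0. Below is a minimal standalone sketch of that convention; it is not part of the btl code and only uses standard POSIX calls.

#define _POSIX_C_SOURCE 200112L

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main (void)
{
    size_t page_size = (size_t) sysconf (_SC_PAGESIZE);
    void *buffer = NULL;
    int rc;

    /* posix_memalign returns 0 on success or an error number (e.g. ENOMEM) */
    rc = posix_memalign (&buffer, page_size, 4 * page_size);
    if (0 != rc) {
        fprintf (stderr, "posix_memalign failed: %s\n", strerror (rc));
        return EXIT_FAILURE;
    }

    memset (buffer, 0, 4 * page_size);
    printf ("allocated %zu bytes aligned to a %zu-byte boundary\n",
            4 * page_size, page_size);

    free (buffer);
    return EXIT_SUCCESS;
}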
Example #4
static void *mca_btl_scif_connect_accept (void *arg)
{
    struct scif_pollepd pollepd = {.epd = mca_btl_scif_module.scif_fd, .events = SCIF_POLLIN, .revents = 0};
    int rc;

    BTL_VERBOSE(("btl/scif: listening for new connections"));

    /* listen for connections */
    while (1) {
        pollepd.revents = 0;

        rc = scif_poll (&pollepd, 1, -1);
        if (1 == rc) {
            if (SCIF_POLLIN != pollepd.revents) {
                break;
            }

            rc = mca_btl_scif_ep_connect_start_passive ();
            if (OMPI_SUCCESS != rc) {
                BTL_VERBOSE(("btl/scif: error accepting scif connection"));
                continue;
            }
        } else {
            break;
        }
    }

    BTL_VERBOSE(("btl/scif: stopped listening for new connections"));

    return NULL;
}
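
The accept thread above follows a common shape: block in a poll call with an infinite timeout, treat anything other than a clean "readable" event as a signal to stop listening, and keep listening when a single accept attempt fails. Here is a hedged standalone sketch of the same loop shape using standard sockets and poll(2) instead of the SCIF API; handle_one_connection and accept_loop are illustrative names, not part of the btl.

#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

/* hypothetical stand-in for mca_btl_scif_ep_connect_start_passive(): accept
 * one pending connection and (for this sketch) immediately close it */
static int handle_one_connection (int listen_fd)
{
    int fd = accept (listen_fd, NULL, NULL);
    if (fd < 0) {
        return -1;
    }
    close (fd);
    return 0;
}

/* same loop shape as the accept thread above: block in poll, bail out on
 * anything other than a clean "readable" event, keep going when one accept
 * attempt fails */
static void accept_loop (int listen_fd)
{
    struct pollfd pfd = {.fd = listen_fd, .events = POLLIN, .revents = 0};

    while (1) {
        pfd.revents = 0;

        int rc = poll (&pfd, 1, -1);
        if (1 != rc || POLLIN != pfd.revents) {
            break;
        }

        if (0 != handle_one_connection (listen_fd)) {
            continue;
        }
    }
}

int main (void)
{
    struct sockaddr_in addr = {.sin_family = AF_INET, .sin_port = 0,
                               .sin_addr = {.s_addr = INADDR_ANY}};
    int listen_fd = socket (AF_INET, SOCK_STREAM, 0);

    if (listen_fd < 0 ||
        bind (listen_fd, (struct sockaddr *) &addr, sizeof (addr)) < 0 ||
        listen (listen_fd, 64) < 0) {
        perror ("listen setup");
        return 1;
    }

    accept_loop (listen_fd);   /* in the btl this runs on a separate thread */
    close (listen_fd);
    return 0;
}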

int mca_btl_scif_del_procs (struct mca_btl_base_module_t *btl,
                            size_t nprocs, struct ompi_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers) {
    /* do nothing for now */
    return OMPI_SUCCESS;
}

static int scif_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg)
{
    mca_btl_scif_reg_t *scif_reg = (mca_btl_scif_reg_t *)reg;
    size_t size = (size_t)((uintptr_t) reg->bound - (uintptr_t) reg->base);
    int i;

    /* deregister the fragment from all connected endpoints */
    for (i = 0 ; i < (int) mca_btl_scif_module.endpoint_count ; ++i) {
        if ((off_t)-1 != scif_reg->registrations[i] &&
            MCA_BTL_SCIF_EP_STATE_CONNECTED == mca_btl_scif_module.endpoints[i].state) {
            (void) scif_unregister(mca_btl_scif_module.endpoints[i].scif_epd,
                                   scif_reg->registrations[i], size);
        }
    }

    free (scif_reg->registrations);

    return OMPI_SUCCESS;
}
Example #5
/* send the eager rdma connect message to the remote endpoint */
static int mca_btl_openib_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    int rc;

    frag = alloc_control_frag(openib_btl);
    if(NULL == frag) {
        return -1;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 rdma_hdr->rkey,
                 rdma_hdr->rdma_start.lval,
                 rdma_hdr->rdma_start.pval,
                 rdma_hdr->rdma_start.ival,
                 rdma_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t)
                 ));

    if(endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));

        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval,
                     rdma_hdr->rdma_start.ival
                     ));
    }
    rc = mca_btl_openib_endpoint_send(endpoint, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
        return OPAL_SUCCESS;

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
    return rc;
}
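
When endpoint->nbo is set, the control header is converted to network byte order before it goes on the wire. The sketch below shows that kind of host-to-network conversion for a small two-field header; the struct and helper are made up for illustration, and htobe64 is a glibc extension from <endian.h>.

#define _DEFAULT_SOURCE

#include <arpa/inet.h>   /* htonl */
#include <endian.h>      /* htobe64 (glibc extension) */
#include <inttypes.h>
#include <stdio.h>

/* hypothetical wire header, loosely modeled on the eager-RDMA control header */
struct rdma_connect_hdr {
    uint32_t rkey;
    uint64_t rdma_start;
};

static void rdma_connect_hdr_hton (struct rdma_connect_hdr *hdr)
{
    hdr->rkey       = htonl (hdr->rkey);
    hdr->rdma_start = htobe64 (hdr->rdma_start);
}

int main (void)
{
    struct rdma_connect_hdr hdr = {.rkey = 0x1234, .rdma_start = 0xdeadbeefull};
    int peer_needs_conversion = 1;   /* stand-in for the endpoint->nbo check */

    if (peer_needs_conversion) {
        rdma_connect_hdr_hton (&hdr);
    }

    printf ("rkey on the wire: 0x%08" PRIx32 "\n", hdr.rkey);
    return 0;
}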
Example #6
static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
    mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep);
    gni_return_t grc;
    int rc;

    BTL_VERBOSE(("finishing connection. remote attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, "
                 "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, "
                 "msg_maxsize = %d", ep->remote_attr->smsg_attr.msg_type, ep->remote_attr->smsg_attr.msg_buffer,
                 ep->remote_attr->smsg_attr.buff_size, ep->remote_attr->smsg_attr.mem_hndl.qword1,
                 ep->remote_attr->smsg_attr.mem_hndl.qword2, ep->remote_attr->smsg_attr.mbox_offset,
                 ep->remote_attr->smsg_attr.mbox_maxcredit, ep->remote_attr->smsg_attr.msg_maxsize));

    BTL_VERBOSE(("finishing connection. local attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, "
                 "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, "
                 "msg_maxsize = %d", ep->mailbox->attr.smsg_attr.msg_type, ep->mailbox->attr.smsg_attr.msg_buffer,
                 ep->mailbox->attr.smsg_attr.buff_size, ep->mailbox->attr.smsg_attr.mem_hndl.qword1,
                 ep->mailbox->attr.smsg_attr.mem_hndl.qword2, ep->mailbox->attr.smsg_attr.mbox_offset,
                 ep->mailbox->attr.smsg_attr.mbox_maxcredit, ep->mailbox->attr.smsg_attr.msg_maxsize));

    grc = GNI_SmsgInit (ep->smsg_ep_handle.gni_handle, &ep->mailbox->attr.smsg_attr,
                        &ep->remote_attr->smsg_attr);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
        BTL_ERROR(("error initializing SMSG protocol. rc = %d", grc));

        return mca_btl_rc_ugni_to_opal (grc);
    }

    /* set the local event data to the local index and the remote event data to my
     * index on the remote peer. This makes lookup of endpoints on completion take
     * a single lookup in the endpoints array. we will not be able to change the
     * remote peer's index in the endpoint's array after this point. */
    GNI_EpSetEventData (ep->smsg_ep_handle.gni_handle, ep->index, ep->remote_attr->index);

    ep->rmt_irq_mem_hndl = ep->remote_attr->rmt_irq_mem_hndl;
    ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED;
    (void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle.device->smsg_connections, 1);

    /* send all pending messages */
    BTL_VERBOSE(("endpoint connected. posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list)));

    rc = mca_btl_ugni_progress_send_wait_list (ep);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
        if (false == ep->wait_listed) {
            opal_list_append (&ugni_module->ep_wait_list, &ep->super);
            ep->wait_listed = true;
        }
        OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
    }

    free (ep->remote_attr);
    ep->remote_attr = NULL;

    return OPAL_SUCCESS;
}
Example #7
static inline int mca_btl_scif_ep_connect_start_active (mca_btl_base_endpoint_t *ep) {
    int rc = OPAL_SUCCESS;

    BTL_VERBOSE(("initiaiting connection to remote peer %d with port: %u on local scif node: %u",
                 ep->peer_proc->proc_name.vpid, ep->port_id.port, ep->port_id.node));

    opal_mutex_lock (&ep->lock);
    do {
        if (MCA_BTL_SCIF_EP_STATE_INIT != ep->state) {
            /* the accept thread has already finished this connection */
            rc = OPAL_SUCCESS;
            break;
        }

        ep->state = MCA_BTL_SCIF_EP_STATE_CONNECTING;

        ep->scif_epd = scif_open ();
        if (OPAL_UNLIKELY(SCIF_OPEN_FAILED == ep->scif_epd)) {
            BTL_VERBOSE(("error creating new scif endpoint"));
            rc = OPAL_ERROR;
            break;
        }

        rc = scif_connect (ep->scif_epd, &ep->port_id);
        if (OPAL_UNLIKELY(-1 == rc)) {
            /* the connection attempt failed. this could mean the peer is currently
             * processing connections. we will try again later. */
            BTL_VERBOSE(("error connecting to scif peer. %d", errno));
            rc = OPAL_ERR_RESOURCE_BUSY;
            break;
        }

        rc = scif_send (ep->scif_epd, &OPAL_PROC_MY_NAME, sizeof (OPAL_PROC_MY_NAME), SCIF_SEND_BLOCK);
        if (OPAL_UNLIKELY(-1 == rc)) {
            BTL_VERBOSE(("error in scif_send"));
            rc = OPAL_ERROR;
            break;
        }

        /* build connection data */
        rc = mca_btl_scif_ep_connect_finish (ep, false);
    } while (0);

    if (OPAL_SUCCESS != rc) {
        scif_close (ep->scif_epd);
        ep->scif_epd = -1;
        ep->state = MCA_BTL_SCIF_EP_STATE_INIT;
    }

    opal_mutex_unlock (&ep->lock);

    return rc;
}
/*
 * Connect function.  Start initiation of connections to a remote
 * peer.  We send our Queue Pair information over the RML/OOB
 * communication mechanism.  On completion of our send, a send
 * completion handler is called.
 */
static int xoob_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
                                     mca_btl_base_endpoint_t *endpoint)
{
    int rc = OMPI_SUCCESS;

    OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
    switch (endpoint->ib_addr->status) {
        case MCA_BTL_IB_ADDR_CLOSED:
            BTL_VERBOSE(("The IB addr: sid %" PRIx64 " lid %d"
                        "in MCA_BTL_IB_ADDR_CLOSED status,"
                        " sending ENDPOINT_XOOB_CONNECT_REQUEST\n",
                        endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
            if (OMPI_SUCCESS != (rc = xoob_send_qp_create(endpoint))) {
                break;
            }

            /* Send connection info over to remote endpoint */
            endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTING;
            if (OMPI_SUCCESS !=
                    (rc = xoob_send_connect_data(endpoint, ENDPOINT_XOOB_CONNECT_REQUEST))) {
                BTL_ERROR(("Error sending connect request, error code %d", rc));
            }
            break;
        case MCA_BTL_IB_ADDR_CONNECTING:
            BTL_VERBOSE(("The IB addr: sid %" PRIx64 " lid %d"
                        "in MCA_BTL_IB_ADDR_CONNECTING status,"
                        " Subscribing to this address\n",
                        endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
            /* somebody is already connecting to this machine, let's wait */
            opal_list_append(&endpoint->ib_addr->pending_ep, &(endpoint->super));
            endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
            break;
        case MCA_BTL_IB_ADDR_CONNECTED:
            /* so we have the send qp, we just need the receive side.
             * Send request for SRQ numbers */
            BTL_VERBOSE(("The IB addr: sid %" PRIx64 " lid %d"
                        "in MCA_BTL_IB_ADDR_CONNECTED status,"
                        " sending ENDPOINT_XOOB_CONNECT_XRC_REQUEST\n",
                        endpoint->ib_addr->subnet_id,endpoint->ib_addr->lid));
            endpoint->endpoint_state = MCA_BTL_IB_CONNECTING;
            if (OMPI_SUCCESS !=
                    (rc = xoob_send_connect_data(endpoint, ENDPOINT_XOOB_CONNECT_XRC_REQUEST))) {
                BTL_ERROR(("error sending xrc connect request, error code %d", rc));
            }
            break;
        default :
            BTL_ERROR(("Invalid endpoint status %d", endpoint->ib_addr->status));
    }
    OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
    return rc;
}
Example #9
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
                                              mca_btl_ugni_cq_t *cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    int rc;

    rc = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
    if (0 >= rc) {
        return rc;
    }

    BTL_VERBOSE(("got %d completed rdma descriptors", rc));

    for (int i = 0 ; i < rc ; ++i) {
        BTL_VERBOSE(("post descriptor %p complete. GNI_CQ_STATUS_OK(): %d", post_desc[i],
                     GNI_CQ_STATUS_OK(event_data[i])));

        if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data[i]))) {
            uint32_t recoverable = 1;

            (void) GNI_CqErrorRecoverable (event_data[i], &recoverable);

            if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries ||
                              !recoverable)) {
                char char_buffer[1024];
                GNI_CqErrorStr (event_data[i], char_buffer, 1024);
                /* give up */
                BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i],
                           recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
                btl_ugni_dump_post_desc (post_desc[i]);
#endif
                mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR);

                return OPAL_ERROR;
            }

            mca_btl_ugni_repost (ugni_module, post_desc[i]);

            return 0;
        }

        mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS);
    }

    /* there should now be resources available to progress the pending post list */
    (void) mca_btl_ugni_post_pending (ugni_module, device);

    return rc;
}
Example #10
/**
 * Initiate a get operation.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information
 * @param descriptor (IN)  Description of the data to be transferred
 */
int mca_btl_ugni_get (struct mca_btl_base_module_t *btl,
                      struct mca_btl_base_endpoint_t *endpoint,
                      struct mca_btl_base_descriptor_t *des) {
    mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) des;
    mca_btl_ugni_segment_t *src_seg = (mca_btl_ugni_segment_t *) des->des_remote;
    mca_btl_ugni_segment_t *dst_seg = (mca_btl_ugni_segment_t *) des->des_local;
    size_t size = src_seg->base.seg_len - src_seg->extra_byte_count;
    bool check;

    BTL_VERBOSE(("Using RDMA/FMA Get"));

    /* cause endpoint to bind if it isn't already (bind is sufficient for rdma) */
    (void) mca_btl_ugni_check_endpoint_state(endpoint);

    /* Check if the get is aligned/sized on a multiple of 4 */
    check = !!((des->des_remote->seg_addr.lval | des->des_local->seg_addr.lval | size) & 3);

    if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
        /* switch to put */
        return OPAL_ERR_NOT_AVAILABLE;
    }

    if (src_seg->extra_byte_count) {
        memmove ((char *) dst_seg->base.seg_addr.pval + size, src_seg->extra_bytes, src_seg->extra_byte_count);
        src_seg->base.seg_len = size;
        dst_seg->base.seg_len = size;
    }

    des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

    return mca_btl_ugni_post (frag, true, dst_seg, src_seg);
}
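
The alignment test above relies on a small bit trick: OR-ing the two addresses and the length together preserves any low bit that is set in any of them, so a single mask of the low two bits checks 4-byte alignment of all three values at once. A tiny standalone illustration follows; is_4_byte_aligned is just an illustrative name.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true when both addresses and the length are multiples of 4: any low bit set
 * in any of the three values survives the OR and is caught by the mask */
static bool is_4_byte_aligned (uint64_t local_addr, uint64_t remote_addr, uint64_t size)
{
    return 0 == ((local_addr | remote_addr | size) & 3);
}

int main (void)
{
    printf ("%d\n", is_4_byte_aligned (0x1000, 0x2000, 64));   /* 1 */
    printf ("%d\n", is_4_byte_aligned (0x1002, 0x2000, 64));   /* 0: misaligned address */
    printf ("%d\n", is_4_byte_aligned (0x1000, 0x2000, 65));   /* 0: odd length */
    return 0;
}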
Example #11
int mca_btl_ugni_ep_connect_progress (mca_btl_base_endpoint_t *ep) {
    int rc;

    BTL_VERBOSE(("progressing connection for endpoint %p with state %d", (void *)ep, ep->state));

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        return OPAL_SUCCESS;
    }

    if (MCA_BTL_UGNI_EP_STATE_RDMA >= ep->state) {
        rc = mca_btl_ugni_ep_connect_start (ep);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    if (GNI_SMSG_TYPE_INVALID == ep->remote_attr.smsg_attr.msg_type) {
        /* use datagram to exchange connection information with the remote peer */
        rc = mca_btl_ugni_directed_ep_post (ep);
        if (OPAL_SUCCESS == rc) {
            rc = OPAL_ERR_RESOURCE_BUSY;
        }
        return rc;
    }

    return mca_btl_ugni_ep_connect_finish (ep);
}
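
Examples #1, #6 and the progress function above together form a small connection state machine: start in an init/RDMA state, move to connecting once local resources are bound, and only finish once the peer's attributes have arrived via the datagram exchange. The sketch below compresses that shape into a self-contained program; the enum, struct and function names are invented for illustration and are not the btl's types.

#include <stdio.h>

/* hypothetical endpoint states mirroring the MCA_BTL_UGNI_EP_STATE_* values */
enum ep_state {
    EP_STATE_INIT,
    EP_STATE_CONNECTING,
    EP_STATE_CONNECTED
};

struct endpoint {
    enum ep_state state;
    int have_remote_attributes;   /* stand-in for a filled-in remote_attr */
};

/* drive the connection one step forward; returns 0 once connected, 1 when
 * more progress calls are needed (roughly OPAL_ERR_RESOURCE_BUSY above) */
static int ep_connect_progress (struct endpoint *ep)
{
    switch (ep->state) {
    case EP_STATE_INIT:
        /* bind local resources and mark the endpoint as connecting */
        ep->state = EP_STATE_CONNECTING;
        return 1;
    case EP_STATE_CONNECTING:
        /* the connection can only finish once the peer's attributes arrive */
        if (!ep->have_remote_attributes) {
            return 1;
        }
        ep->state = EP_STATE_CONNECTED;
        return 0;
    case EP_STATE_CONNECTED:
    default:
        return 0;
    }
}

int main (void)
{
    struct endpoint ep = {.state = EP_STATE_INIT, .have_remote_attributes = 0};
    int rc;

    rc = ep_connect_progress (&ep);
    printf ("progress -> %d (state %d)\n", rc, (int) ep.state);

    ep.have_remote_attributes = 1;   /* pretend the datagram exchange completed */
    rc = ep_connect_progress (&ep);
    printf ("progress -> %d (state %d)\n", rc, (int) ep.state);

    rc = ep_connect_progress (&ep);
    printf ("progress -> %d (state %d)\n", rc, (int) ep.state);
    return 0;
}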
static int mca_btl_ud_modex_send(void)
{
    int rc;
    size_t i;
    size_t size;
    mca_btl_ud_addr_t* addrs = NULL;

    size = mca_btl_ofud_component.num_btls * sizeof(mca_btl_ud_addr_t);
    if(size != 0) {
        addrs = (mca_btl_ud_addr_t*)malloc(size);
        if(NULL == addrs) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        for(i = 0; i < mca_btl_ofud_component.num_btls; i++) {
            mca_btl_ud_module_t* btl = &mca_btl_ofud_component.ud_btls[i];
            addrs[i] = btl->addr;
    
            BTL_VERBOSE(("modex_send QP num %x, LID = %x",
              addrs[i].qp_num, addrs[i].lid));
        }
    }

    rc = ompi_modex_send(
            &mca_btl_ofud_component.super.btl_version, addrs, size);
    if(NULL != addrs) {
        free(addrs);
    }
    return rc;
}
int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect) {
    gni_return_t rc;

    do {
        if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) {
            /* nothing to do */
            break;
        }

        if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state && send_disconnect) {
            rc = GNI_SmsgSendWTag (ep->smsg_ep_handle, NULL, 0, NULL, 0, -1,
                                   MCA_BTL_UGNI_TAG_DISCONNECT);
            if (GNI_RC_SUCCESS != rc) {
                BTL_VERBOSE(("btl/ugni could not send close message"));
            }

            /* we might want to wait for local completion here (do we even care) */
        }

        (void) ompi_common_ugni_ep_destroy (&ep->smsg_ep_handle);
        (void) ompi_common_ugni_ep_destroy (&ep->rdma_ep_handle);

        OMPI_FREE_LIST_RETURN(&ep->btl->smsg_mboxes, ((ompi_free_list_item_t *) ep->mailbox));
        ep->mailbox = NULL;

        ep->state = MCA_BTL_UGNI_EP_STATE_INIT;
    } while (0);

    return OMPI_SUCCESS;
}
Example #14
/**
 * Prepare the dst buffer
 *
 * @param btl (IN)      BTL module
 * @param peer (IN)     BTL peer addressing
 * prepare dest's behavior depends on the following:
 * Has a valid memory registration been passed to prepare_src?
 *    if so we attempt to use the pre-registered user buffer; if the memory registration
 *    only covers a portion of the user buffer then we must re-register the user buffer
 * Has the user requested the memory to be left pinned?
 *    if so we insert the memory registration into a memory tree for later lookup; we
 *    may also remove a previous registration if an MRU (most recently used) list of
 *    registrations is full, which prevents resources from being exhausted.
 */
mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
    struct mca_btl_base_module_t* btl,
    struct mca_btl_base_endpoint_t* endpoint,
    mca_mpool_base_registration_t* registration,
    struct ompi_convertor_t* convertor,
    size_t reserve,
    size_t* size)
{
    mca_btl_openib_module_t *openib_btl;
    mca_btl_openib_frag_t *frag;
    mca_btl_openib_reg_t *openib_reg;
    int rc;
    ptrdiff_t lb;

    openib_btl = (mca_btl_openib_module_t*)btl;
    
    MCA_BTL_IB_FRAG_ALLOC_RECV_FRAG(btl, frag, rc);
    if(NULL == frag) {
        return NULL;
    }
    
    ompi_ddt_type_lb(convertor->pDesc, &lb);
    frag->segment.seg_addr.pval = convertor->pBaseBuf + lb +
        convertor->bConverted;

    if(NULL == registration){
        /* we didn't get a memory registration passed in, so we have to
         * register the region ourselves
         */ 
        rc = btl->btl_mpool->mpool_register(btl->btl_mpool,
                frag->segment.seg_addr.pval, *size, 0, &registration);
        if(OMPI_SUCCESS != rc || NULL == registration) {
            MCA_BTL_IB_FRAG_RETURN(openib_btl, frag);
            return NULL;
        }
        /* keep track of the registration we did */
        frag->registration = (mca_btl_openib_reg_t*)registration;
    }
    openib_reg = (mca_btl_openib_reg_t*)registration;

    frag->sg_entry.length = *size;
    frag->sg_entry.lkey = openib_reg->mr->lkey;
    frag->sg_entry.addr = (unsigned long) frag->segment.seg_addr.pval;

    frag->segment.seg_len = *size;
    frag->segment.seg_key.key32[0] = openib_reg->mr->rkey;

    frag->base.des_dst = &frag->segment;
    frag->base.des_dst_cnt = 1;
    frag->base.des_src = NULL;
    frag->base.des_src_cnt = 0;
    frag->base.des_flags = 0;

    BTL_VERBOSE(("frag->sg_entry.lkey = %lu .addr = %llu "
                "frag->segment.seg_key.key32[0] = %lu",
                frag->sg_entry.lkey, frag->sg_entry.addr,
                frag->segment.seg_key.key32[0]));

    return &frag->base;
}
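
The behavior described in the comment block before mca_btl_openib_prepare_dst boils down to: reuse a registration handed in by the caller, otherwise register the region yourself and remember that you own it so it can be released together with the descriptor. Below is a hedged standalone sketch of that ownership pattern with stand-in types; mem_registration, prepare_descriptor and friends are illustrative names, not the mpool API, and the MRU caching of registrations is left out.

#include <stdio.h>
#include <stdlib.h>

/* hypothetical stand-ins for an mpool registration handle and its API */
struct mem_registration {
    void  *base;
    size_t len;
};

static struct mem_registration *register_region (void *base, size_t len)
{
    struct mem_registration *reg = malloc (sizeof (*reg));
    if (NULL != reg) {
        reg->base = base;
        reg->len  = len;
    }
    return reg;
}

static void deregister_region (struct mem_registration *reg)
{
    free (reg);
}

struct descriptor {
    void  *addr;
    size_t len;
    struct mem_registration *owned_registration;   /* NULL if caller-owned */
};

/* reuse a registration handed in by the caller, or create one and record it
 * so it can be released together with the descriptor */
static int prepare_descriptor (struct descriptor *des, void *addr, size_t len,
                               struct mem_registration *registration)
{
    des->addr = addr;
    des->len  = len;
    des->owned_registration = NULL;

    if (NULL == registration) {
        registration = register_region (addr, len);
        if (NULL == registration) {
            return -1;
        }
        /* keep track of the registration we did */
        des->owned_registration = registration;
    }

    return 0;
}

static void release_descriptor (struct descriptor *des)
{
    if (NULL != des->owned_registration) {
        deregister_region (des->owned_registration);
        des->owned_registration = NULL;
    }
}

int main (void)
{
    char buffer[4096];
    struct descriptor des;

    if (0 == prepare_descriptor (&des, buffer, sizeof (buffer), NULL)) {
        printf ("descriptor covers %zu bytes\n", des.len);
        release_descriptor (&des);
    }
    return 0;
}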
/*
 * Look for an existing TCP process instance based on the globally unique
 * process identifier.
 */
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name)
{
    mca_btl_tcp_proc_t* proc = NULL;

    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
    opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
                              *name, (void**)&proc);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
    if (OPAL_UNLIKELY(NULL == proc)) {
        mca_btl_base_endpoint_t *endpoint;
        opal_proc_t *opal_proc;

        BTL_VERBOSE(("adding tcp proc for unknown peer {.jobid = 0x%x, .vpid = 0x%x}",
                     name->jobid, name->vpid));

        opal_proc = opal_proc_for_name (*name);
        if (NULL == opal_proc) {
            return NULL;
        }

        /* try adding this proc to each btl until */
        for( uint32_t i = 0; i < mca_btl_tcp_component.tcp_num_btls; ++i ) {
            endpoint = NULL;
            (void) mca_btl_tcp_add_procs (&mca_btl_tcp_component.tcp_btls[i]->super, 1, &opal_proc,
                                          &endpoint, NULL);
            if (NULL != endpoint && NULL == proc) {
                /* get the proc and continue on (could probably just break here) */
                proc = endpoint->endpoint_proc;
            }
        }
    }

    return proc;
}
Example #16
static int mca_btl_ugni_endpoint_get_modex (mca_btl_base_endpoint_t *ep)
{
    mca_btl_ugni_modex_t *modex;
    size_t msg_size;
    int rc;

    assert (NULL != ep && NULL != ep->peer_proc);

    /* Receive the modex */
    OPAL_MODEX_RECV(rc, &mca_btl_ugni_component.super.btl_version,
                    &ep->peer_proc->proc_name, (void **)&modex, &msg_size);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        BTL_ERROR(("error receiving modex"));
        return rc;
    }

    ep->ep_rem_addr = modex->addr;
    ep->ep_rem_id = modex->id;


    BTL_VERBOSE(("received modex for ep %p. addr: %d, id: %d",  (void*)ep, ep->ep_rem_addr, ep->ep_rem_id));

    free (modex);

    return OPAL_SUCCESS;
}
Example #17
static inline int
mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
{
    int pending_post_count = opal_list_get_size (&device->pending_post);
    mca_btl_ugni_post_descriptor_t *post_desc;
    int rc;

    /* check if there are any posts pending due to a lack of resources */
    if (OPAL_LIKELY(0 == pending_post_count)) {
        return 0;
    }

    BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", pending_post_count));
    for (int i = 0 ; i < pending_post_count ; ++i) {
        mca_btl_ugni_device_lock (device);
        post_desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
        mca_btl_ugni_device_unlock (device);
        if (NULL == post_desc) {
            break;
        }
        rc = mca_btl_ugni_repost (ugni_module, post_desc);
        if (OPAL_SUCCESS != rc) {
            mca_btl_ugni_device_lock (device);
            opal_list_prepend (&device->pending_post, (opal_list_item_t *) post_desc);
            mca_btl_ugni_device_unlock (device);
            break;
        }
    }

    return 1;
}
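
The function above drains a pending queue under the device lock, reposting each descriptor and putting a descriptor back at the head of the list as soon as resources run out again. The standalone sketch below shows that drain-and-reprepend pattern with a plain linked list and a pthread mutex; the list, try_repost and progress_pending are stand-ins, and the snapshot-count bookkeeping of the original is omitted.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* minimal pending list protected by a mutex; a stand-in for the
 * device->pending_post opal_list_t and the device lock used above */
struct pending_item {
    struct pending_item *next;
    int payload;
};

static struct pending_item *pending_head = NULL;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

/* hypothetical repost: returns 0 on success, -1 when resources are exhausted */
static int try_repost (struct pending_item *item)
{
    return (0 == item->payload % 2) ? 0 : -1;
}

static int progress_pending (void)
{
    int progressed = 0;

    for (;;) {
        pthread_mutex_lock (&pending_lock);
        struct pending_item *item = pending_head;
        if (NULL != item) {
            pending_head = item->next;
        }
        pthread_mutex_unlock (&pending_lock);

        if (NULL == item) {
            break;
        }

        if (0 != try_repost (item)) {
            /* out of resources again: push the item back on the head and stop */
            pthread_mutex_lock (&pending_lock);
            item->next = pending_head;
            pending_head = item;
            pthread_mutex_unlock (&pending_lock);
            break;
        }

        free (item);
        ++progressed;
    }

    return progressed;
}

int main (void)
{
    /* queue a few items; even payloads always repost successfully */
    for (int i = 3; i > 0; --i) {
        struct pending_item *item = malloc (sizeof (*item));
        if (NULL == item) {
            return 1;
        }
        item->payload = 2 * i;
        item->next = pending_head;
        pending_head = item;
    }

    printf ("progressed %d pending items\n", progress_pending ());
    return 0;
}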
/*
 * Reply to a `start - connect' message
 */
static int reply_start_connect(mca_btl_openib_endpoint_t *endpoint,
                               mca_btl_openib_rem_info_t *rem_info)
{
    int rc;

    BTL_VERBOSE(("Initialized QPs, LID = %d",
                 ((mca_btl_openib_module_t*)endpoint->endpoint_btl)->lid));

    /* Create local QP's and post receive resources */
    if (OMPI_SUCCESS != (rc = qp_create_all(endpoint))) {
        return rc;
    }

    /* Set the remote side info */
    set_remote_info(endpoint, rem_info);
    
    /* Connect to remote endpoint qp's */
    if (OMPI_SUCCESS != (rc = qp_connect_all(endpoint))) {
        return rc;
    }

    /* Send connection info over to remote endpoint */
    endpoint->endpoint_state = MCA_BTL_IB_CONNECT_ACK;
    if (OMPI_SUCCESS !=
        (rc = send_connect_data(endpoint, ENDPOINT_CONNECT_RESPONSE))) {
        BTL_ERROR(("error in endpoint send connect request error code is %d",
                   rc));
        return rc;
    }
    return OMPI_SUCCESS;
}
Example #19
static inline int
mca_btl_ugni_handle_remote_smsg_overrun (mca_btl_ugni_module_t *btl)
{
    gni_cq_entry_t event_data;
    unsigned int ep_index;
    int count, rc;

    BTL_VERBOSE(("btl/ugni_component detected SMSG CQ overrun. "
                 "processing message backlog..."));

    /* we don't know which endpoint lost an smsg completion. clear the
       smsg remote cq and check all mailboxes */

    /* clear out remote cq */
    do {
        rc = GNI_CqGetEvent (btl->smsg_remote_cq, &event_data);
    } while (GNI_RC_NOT_DONE != rc);

    for (ep_index = 0, count = 0 ; ep_index < btl->endpoint_count ; ++ep_index) {
        mca_btl_base_endpoint_t *ep = btl->endpoints[ep_index];

        if (NULL == ep || MCA_BTL_UGNI_EP_STATE_CONNECTED != ep->state) {
            continue;
        }

        /* clear out smsg mailbox */
        rc = mca_btl_ugni_smsg_process (ep);
        if (OPAL_LIKELY(rc >= 0)) {
            count += rc;
        }
    }

    return count;
}
void mca_btl_openib_load_apm_xrc_rcv(uint32_t qp_num, mca_btl_openib_endpoint_t *ep)
{
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    enum ibv_qp_attr_mask mask = 0;
    struct mca_btl_openib_module_t *btl;

    BTL_VERBOSE(("APM XRC: Loading alternative path"));
    assert (NULL != ep);
    btl = ep->endpoint_btl;

    if (ibv_query_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask, &qp_init_attr))
        BTL_ERROR(("Failed to ibv_query_qp, qp num: %d", qp_num));

    if (mca_btl_openib_component.apm_lmc &&
            attr.ah_attr.src_path_bits - btl->src_path_bits < mca_btl_openib_component.apm_lmc) {
        apm_update_attr(&attr, &mask);
    } else {
        if (mca_btl_openib_component.apm_ports) {
            /* Try to migrate to next port */
            if (OPAL_SUCCESS != apm_update_port(ep, &attr, &mask))
                return;
        } else {
            BTL_ERROR(("Failed to load alternative path, all %d were used",
                        attr.ah_attr.src_path_bits - btl->src_path_bits));
        }
    }

    ibv_modify_xrc_rcv_qp(btl->device->xrc_domain, qp_num, &attr, mask);
    /* Maybe the qp was already modified by another process - ignoring the error */
}
/* Find endpoint for specific subnet/lid/message */
static mca_btl_openib_endpoint_t* xoob_find_endpoint(ompi_process_name_t* process_name,
        uint64_t subnet_id, uint16_t lid, uint8_t message_type)
{
    size_t i;
    mca_btl_openib_proc_t *ib_proc;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;
    bool found = false;

    BTL_VERBOSE(("Searching for ep and proc with follow parameters:"
                "jobid %d, vpid %d, "
                "sid %" PRIx64 ", lid %d",
                process_name->jobid, process_name->vpid,
                subnet_id, lid));


    /* find ibproc */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
    for (ib_proc = (mca_btl_openib_proc_t*)
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
            ib_proc != (mca_btl_openib_proc_t*)
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
            ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
        if (OPAL_EQUAL == ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
                    &ib_proc->proc_ompi->proc_name, process_name)) {
            found = true;
            break;
        }
    }
    /* we found our ib_proc, lets find endpoint now */
    if (found) {
        for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
            ib_endpoint = ib_proc->proc_endpoints[i];
            /* we need to check different
             * lid for different message type */
            if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type ||
                    ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) {
                /* response message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->ib_addr->lid == lid) {
                    break; /* Found one */
                }
            } else {
                /* request message */
                if (ib_endpoint->subnet_id == subnet_id &&
                        ib_endpoint->endpoint_btl->lid == lid) {
                    break; /* Found one */
                }
            }
        }
        if (NULL == ib_endpoint) {
                BTL_ERROR(("can't find suitable endpoint for this peer\n"));
        }
    } else {
            BTL_ERROR(("can't find suitable endpoint for this peer\n"));
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
    return ib_endpoint;
}
Example #22
int mca_btl_scif_module_init (void)
{
    int rc;

    /* create an endpoint to listen for connections */
    mca_btl_scif_module.scif_fd = scif_open ();
    if (-1 == mca_btl_scif_module.scif_fd) {
        BTL_VERBOSE(("scif_open failed. errno = %d", errno));
        return OPAL_ERROR;
    }

    /* bind the endpoint to a port */
    mca_btl_scif_module.port_id.port = scif_bind (mca_btl_scif_module.scif_fd, 0);
    if (-1 == mca_btl_scif_module.port_id.port) {
        BTL_VERBOSE(("scif_bind failed. errno = %d", errno));
        scif_close (mca_btl_scif_module.scif_fd);
        mca_btl_scif_module.scif_fd = -1;
        return OPAL_ERROR;
    }

    /* determine this process's node id */
    rc = scif_get_nodeIDs (NULL, 0, &mca_btl_scif_module.port_id.node);
    if (-1 == rc) {
        BTL_VERBOSE(("btl/scif error getting node id of this node"));
        return OPAL_ERROR;
    }

    /* Listen for connections */
    /* TODO - base the maximum backlog off something */
    rc = scif_listen (mca_btl_scif_module.scif_fd, 64);
    if (-1 == rc) {
        BTL_VERBOSE(("scif_listen failed. errno = %d", errno));
        scif_close (mca_btl_scif_module.scif_fd);
        mca_btl_scif_module.scif_fd = -1;
        return OPAL_ERROR;
    }

    BTL_VERBOSE(("btl/scif: listening @ port %u on node %u\n",
                 mca_btl_scif_module.port_id.port, mca_btl_scif_module.port_id.node));

    OBJ_CONSTRUCT(&mca_btl_scif_module.dma_frags, opal_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_scif_module.eager_frags, opal_free_list_t);

    return OPAL_SUCCESS;
}
Example #23
int mca_btl_ugni_progress_remote_smsg (mca_btl_ugni_module_t *btl)
{
    mca_btl_base_endpoint_t *ep;
    gni_cq_entry_t event_data;
    gni_return_t grc;
    uint64_t inst_id;

    grc = mca_btl_ugni_gni_cq_get_event (btl->devices, btl->smsg_remote_cq, &event_data);
    if (GNI_RC_NOT_DONE == grc) {
        return 0;
    }

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data) ||
                      GNI_CQ_OVERRUN(event_data))) {
        if (GNI_RC_ERROR_RESOURCE == grc ||
            (GNI_RC_SUCCESS == grc && GNI_CQ_OVERRUN(event_data))) {
            /* recover from smsg cq overrun */
            return mca_btl_ugni_handle_remote_smsg_overrun (btl);
        }

        BTL_ERROR(("unhandled error in GNI_CqGetEvent"));

        /* unhandled error: crash */
        assert (0);
        return mca_btl_rc_ugni_to_opal (grc);
    }

    BTL_VERBOSE(("REMOTE CQ: Got event 0x%" PRIx64 ". msg id = %" PRIu64
                 ". ok = %d, type = %" PRIu64, (uint64_t) event_data,
                 GNI_CQ_GET_INST_ID(event_data), GNI_CQ_STATUS_OK(event_data),
                 GNI_CQ_GET_TYPE(event_data)));

    inst_id = GNI_CQ_GET_INST_ID(event_data);

    ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&btl->endpoints, inst_id);

    if (OPAL_UNLIKELY(MCA_BTL_UGNI_EP_STATE_CONNECTED != ep->state)) {
        /* due to the nature of datagrams we may get a smsg completion before
           we get mailbox info from the peer */
        BTL_VERBOSE(("event occurred on an unconnected endpoint! ep state = %d", ep->state));
        return 0;
    }

    return mca_btl_ugni_smsg_process (ep);
}
/**
 * Initiate a get operation.
 *
 * @param btl (IN)         BTL module
 * @param endpoint (IN)    BTL addressing information
 * @param descriptor (IN)  Description of the data to be transferred
 */
int mca_btl_scif_get (struct mca_btl_base_module_t *btl,
                      struct mca_btl_base_endpoint_t *endpoint,
                      struct mca_btl_base_descriptor_t *des) {
    mca_btl_scif_segment_t *src = (mca_btl_scif_segment_t *) des->des_src;
    mca_btl_scif_segment_t *dst = (mca_btl_scif_segment_t *) des->des_dst;
    size_t len = lmin (src->base.seg_len, dst->base.seg_len);
    int rc, mark, flags = 0;
    off_t roffset, loffset;
    size_t to_get;
#if defined(SCIF_TIMING)
    struct timespec ts;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);

    mca_btl_scif_component.get_count++;
#endif

    BTL_VERBOSE(("Using DMA Get for frag %p from offset %lu", (void *) des,
                 (unsigned long) src->scif_offset));

    roffset = src->scif_offset + (off_t)(src->orig_ptr - src->base.seg_addr.lval);
    loffset = dst->scif_offset + (off_t)(dst->orig_ptr - dst->base.seg_addr.lval);
        
    if (mca_btl_scif_component.rma_use_cpu) {
        flags = SCIF_RMA_USECPU;
    }

    if (mca_btl_scif_component.rma_sync) {
        flags |= SCIF_RMA_SYNC;
    }

    /* start the read */
    rc = scif_readfrom (endpoint->scif_epd, loffset, len, roffset, flags);
    if (OPAL_UNLIKELY(-1 == rc)) {
        return OMPI_ERROR;
    }

    /* always call the callback function */
    des->des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK; 

    if (!(flags & SCIF_RMA_SYNC)) {
        /* according to the scif documentation it is better to use a fence rather
         * than using the SCIF_RMA_SYNC flag with scif_readfrom */
        scif_fence_mark (endpoint->scif_epd, SCIF_FENCE_INIT_SELF, &mark);
        scif_fence_wait (endpoint->scif_epd, mark);
    }

#if defined(SCIF_TIMING)
    SCIF_UPDATE_TIMER(mca_btl_scif_component.get_time,
                      mca_btl_scif_component.get_time_max, ts);
#endif

    /* since we completed the fence the RMA operation is complete */
    mca_btl_scif_frag_complete ((mca_btl_scif_base_frag_t *) des, OMPI_SUCCESS);

    return OMPI_SUCCESS;
}
Example #25
int mca_btl_ugni_ep_disconnect (mca_btl_base_endpoint_t *ep, bool send_disconnect)
{
    mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_ep_btl (ep);
    mca_btl_ugni_device_t *device;
    int rc;

    if (MCA_BTL_UGNI_EP_STATE_INIT == ep->state) {
        /* nothing to do */
        return OPAL_SUCCESS;
    }

    device = ep->smsg_ep_handle.device;

    while (device->dev_smsg_local_cq.active_operations) {
        /* ensure all sends are complete before removing any procs */
        rc = mca_btl_ugni_progress_local_smsg (ugni_module, device);
        if (OPAL_SUCCESS != rc) {
            break;
        }
    }

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state && send_disconnect) {
        rc = mca_btl_ugni_ep_send_disconnect (ep);
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
            BTL_VERBOSE(("could not send disconnect message to peer"));
        }

        /* wait for the disconnect message to go out */
        do {
            /* ensure all sends are complete before removing any procs */
            rc = mca_btl_ugni_progress_local_smsg (ugni_module, device);
            if (OPAL_SUCCESS != rc) {
                break;
            }
        } while (device->dev_smsg_local_cq.active_operations);

        (void) opal_atomic_add_fetch_32 (&ep->smsg_ep_handle.device->smsg_connections, -1);
    }

    mca_btl_ugni_device_lock (device);

    /* NTH: this call may not need the device lock. seems to work without it but
     * the lock is here to be safe. */
    (void) mca_btl_ugni_ep_handle_cleanup (&ep->smsg_ep_handle);

    mca_btl_ugni_device_unlock (device);

    if (ep->mailbox) {
        opal_free_list_return (&ugni_module->smsg_mboxes, ((opal_free_list_item_t *) ep->mailbox));
        ep->mailbox = NULL;
    }

    ep->state = MCA_BTL_UGNI_EP_STATE_INIT;

    return OPAL_SUCCESS;
}
Example #26
static void mca_btl_ugni_callback_eager_get (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                                             struct mca_btl_base_descriptor_t *desc, int rc)
{
    mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl;
    mca_btl_ugni_base_frag_t *frag = (mca_btl_ugni_base_frag_t *) desc;
    uint32_t len = frag->hdr.eager.send.lag & 0x00ffffff;
    uint8_t tag = frag->hdr.eager.send.lag >> 24;
    size_t payload_len = frag->hdr.eager.src_seg.base.seg_len;
    size_t hdr_len = len - payload_len;
    mca_btl_active_message_callback_t *reg;
    mca_btl_base_segment_t segs[2];
    mca_btl_ugni_base_frag_t tmp;

    BTL_VERBOSE(("eager get for rem_ctx %p complete", frag->hdr.eager.ctx));

    tmp.base.des_local = segs;
    if (hdr_len) {
        tmp.base.des_local_count = 2;

        segs[0].seg_addr.pval = frag->hdr.eager_ex.pml_header;
        segs[0].seg_len       = hdr_len;
        segs[1].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
        segs[1].seg_len       = payload_len;
    } else {
        tmp.base.des_local_count = 1;

        segs[0].seg_addr.pval = frag->segments[0].base.seg_addr.pval;
        segs[0].seg_len       = payload_len;
    }

    reg = mca_btl_base_active_message_trigger + tag;
    reg->cbfunc(&frag->endpoint->btl->super, tag, &(tmp.base), reg->cbdata);

    frag->hdr.rdma.ctx = frag->hdr.eager.ctx;

    /* once complete use this fragment for a pending eager get if any exist */
    frag->base.des_cbfunc = mca_btl_ugni_callback_eager_get_progress_pending;

    /* tell the remote peer the operation is complete */
    rc = opal_mca_btl_ugni_smsg_send (frag, &frag->hdr.rdma, sizeof (frag->hdr.rdma),
                                      NULL, 0, MCA_BTL_UGNI_TAG_RDMA_COMPLETE);
    if (OPAL_UNLIKELY(0 > rc)) {
        /* queue fragment */
        if (false == endpoint->wait_listed) {
            OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
            opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
            OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
            endpoint->wait_listed = true;
        }

        OPAL_THREAD_LOCK(&endpoint->lock);
        opal_list_append (&endpoint->frag_wait_list, (opal_list_item_t *) frag);
        OPAL_THREAD_UNLOCK(&endpoint->lock);
    }
}
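
The callback above starts by decoding a 32-bit "lag" word that carries an 8-bit active-message tag in the high byte and a 24-bit length in the low bytes. A tiny self-contained sketch of that packing scheme follows; pack_lag is an invented helper, not part of the btl.

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

/* pack an 8-bit tag and a 24-bit length into one 32-bit word, mirroring the
 * hdr.eager.send.lag layout decoded in the callback above */
static uint32_t pack_lag (uint8_t tag, uint32_t len)
{
    assert (len <= 0x00ffffff);
    return ((uint32_t) tag << 24) | (len & 0x00ffffff);
}

int main (void)
{
    uint32_t lag = pack_lag (0x42, 1234);

    uint32_t len = lag & 0x00ffffff;        /* low 24 bits: header + payload length */
    uint8_t  tag = (uint8_t) (lag >> 24);   /* high 8 bits: active message tag */

    printf ("tag = 0x%02x, len = %" PRIu32 "\n", (unsigned int) tag, len);
    return 0;
}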
Example #27
static inline int
mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
{
    ompi_common_ugni_post_desc_t *desc;
    mca_btl_ugni_base_frag_t *frag;
    gni_cq_entry_t event_data = 0;
    uint32_t recoverable = 1;
    gni_return_t rc;

    rc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &event_data);
    if (GNI_RC_NOT_DONE == rc) {
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != rc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d", rc));
        assert (0);
        return ompi_common_rc_ugni_to_ompi (rc);
    }

    rc = GNI_GetCompleted (ugni_module->rdma_local_cq, event_data, (gni_post_descriptor_t **) &desc);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc && GNI_RC_TRANSACTION_ERROR != rc)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[rc]));
        return ompi_common_rc_ugni_to_ompi (rc);
    }

    frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc || !GNI_CQ_STATUS_OK(event_data))) {
        (void) GNI_CqErrorRecoverable (event_data, &recoverable);

        if (OPAL_UNLIKELY(++frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            /* give up */
            BTL_ERROR(("giving up on frag %p", (void *) frag));
            frag->cbfunc (frag, OMPI_ERROR);

            return OMPI_ERROR;
        }

        /* repost transaction */
        mca_btl_ugni_repost (frag, OMPI_SUCCESS);

        return 0;
    }

    BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));

    frag->cbfunc (frag, ompi_common_rc_ugni_to_ompi (rc));

    return 1;
}
static inline int
mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    uint64_t datagram_id;
    gni_return_t grc;
    int count = 0;

    /* check for datagram completion */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        return 0;
    }

    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) ==
        MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
        handle = ugni_module->wildcard_ep;
    } else {
        handle =
            ugni_module->endpoints[(uint32_t)(datagram_id & 0xffffffffull)]->smsg_ep_handle;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "peer = %d", datagram_id, post_state, remote_id));

    ep = ugni_module->endpoints[remote_id];

    /* NTH: TODO -- error handling */
    (void) mca_btl_ugni_ep_connect_progress (ep);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /*  process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) ==
        MCA_BTL_UGNI_CONNECT_WILDCARD_ID) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
static int apm_update_port(mca_btl_openib_endpoint_t *ep,
        struct ibv_qp_attr *attr, enum ibv_qp_attr_mask *mask)
{
    size_t port_i;
    uint16_t apm_lid = 0;

    if (attr->port_num == ep->endpoint_btl->apm_port) {
        /* all ports were used */
        BTL_ERROR(("APM: already all ports were used port_num %d apm_port %d",
                    attr->port_num, ep->endpoint_btl->apm_port));
        return OPAL_ERROR;
    }
    /* looking for an alternative lid on the remote side */
    for(port_i = 0; port_i < ep->endpoint_proc->proc_port_count; port_i++) {
        if (ep->endpoint_proc->proc_ports[port_i].pm_port_info.lid == attr->ah_attr.dlid - mca_btl_openib_component.apm_lmc) {
            apm_lid = ep->endpoint_proc->proc_ports[port_i].pm_port_info.apm_lid;
        }
    }
    if (0 == apm_lid) {
        /* APM was disabled on one of the sides? */
        BTL_VERBOSE(("APM: Was disabled ? dlid %d %d %d", attr->ah_attr.dlid, attr->ah_attr.src_path_bits, ep->endpoint_btl->src_path_bits));
        return OPAL_ERROR;
    }
    /* We guess that the LMC is the same on all ports */
    attr->alt_ah_attr.static_rate = attr->ah_attr.static_rate;
    attr->alt_ah_attr.sl = attr->ah_attr.sl;
    attr->alt_pkey_index = attr->pkey_index;
    attr->alt_timeout = attr->timeout;
    attr->path_mig_state = IBV_MIG_REARM;
    *mask = IBV_QP_ALT_PATH|IBV_QP_PATH_MIG_STATE;

    attr->alt_port_num = ep->endpoint_btl->apm_port;
    attr->alt_ah_attr.src_path_bits = ep->endpoint_btl->src_path_bits;
    attr->alt_ah_attr.dlid = apm_lid;

    BTL_VERBOSE(("New APM port loaded: alt_src_port:%d, dlid: %d, src_bits: %d:%d, old_dlid %d",
                attr->alt_port_num, attr->alt_ah_attr.dlid,
                attr->ah_attr.src_path_bits, attr->alt_ah_attr.src_path_bits,
                attr->ah_attr.dlid));
    return OPAL_SUCCESS;
}
Example #30
/*
 * Set remote connection info
 *
 * XXX: Currently size is unused; this will change
 * as soon as we add more info to be exchanged at connection
 * setup.
 *
 */
static int mca_btl_mvapi_endpoint_set_remote_info(mca_btl_base_endpoint_t* endpoint, mca_btl_mvapi_rem_info_t* rem_info)
{
    
    memcpy(&((mca_btl_mvapi_endpoint_t*) endpoint)->rem_info, rem_info, sizeof(mca_btl_mvapi_rem_info_t)); 
    
    BTL_VERBOSE(("Setting High Priority QP num = %d, Low Priority QP num %d,  LID = %d",
                 endpoint->rem_info.rem_qp_num_hp,
                 endpoint->rem_info.rem_qp_num_lp, 
                 endpoint->rem_info.rem_lid));

    return ORTE_SUCCESS;
}