示例#1
0
/*
 * Debugging aid for eager RDMA problems.
 *
 * Prints the local eager RDMA head index followed by one line per local
 * eager RDMA fragment.  This is not a read-only dump: for every fragment
 * the header pointer and the segment address are recomputed from the
 * footer and stored back into the fragment before it is printed.
 */
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
    mca_btl_openib_recv_frag_t *frag_array = endpoint->eager_rdma_local.frags;
    int idx;

    opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);

    for (idx = 0; idx < mca_btl_openib_component.eager_rdma_num; ++idx) {
        mca_btl_openib_recv_frag_t *cur = frag_array + idx;
        mca_btl_openib_control_header_t *ctl;
        int frag_size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(cur->ftr);

        /* the header starts frag_size bytes before the end of the footer */
        cur->hdr = (mca_btl_openib_header_t*)(((char*)cur->ftr) -
                frag_size + sizeof(mca_btl_openib_footer_t));

        /* the payload follows the header directly */
        to_base_frag(cur)->segment.base.seg_addr.pval =
                ((unsigned char*)cur->hdr) + sizeof(mca_btl_openib_header_t);
        ctl = to_base_frag(cur)->segment.base.seg_addr.pval;

        if ((MCA_BTL_TAG_IB != cur->hdr->tag) ||
            (MCA_BTL_OPENIB_CONTROL_CREDITS != ctl->type)) {
            /* anything that is not a credit control message */
            opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d", idx, frag_size,
                        cur->hdr->tag, cur->ftr->u.buf[3]);
            continue;
        }

        opal_output(0, "tag[%d] is credit message", idx);
    }
}
示例#2
0
/*
 * Try to take one send credit for the queue pair the fragment is ordered on.
 *
 * Per-peer queue pairs take the credit from the endpoint, all other queue
 * pairs take it from the module's shared-receive-queue state.  When no
 * credit is left the decrement is undone, the fragment is appended to the
 * matching pending list and OMPI_ERR_OUT_OF_RESOURCE is returned.
 *
 * Priority fragments go to pending list 0, all others to list 1.
 *
 * @param endpoint  endpoint the fragment is sent on
 * @param frag      fragment that needs the credit
 * @return OMPI_SUCCESS when a credit was taken, OMPI_ERR_OUT_OF_RESOURCE
 *         when the fragment was queued instead
 */
static int acquire_send_credit(mca_btl_wv_endpoint_t *endpoint,
        mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_module_t *module = endpoint->endpoint_btl;
    const int qp_idx = to_base_frag(frag)->base.order;
    const int list_idx =
        (to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ? 0 : 1;

    if (!BTL_WV_QP_TYPE_PP(qp_idx)) {
        /* shared receive queue: credits live in the module */
        if (OPAL_THREAD_ADD32(&module->qps[qp_idx].u.srq_qp.sd_credits, -1) >= 0) {
            return OMPI_SUCCESS;
        }

        OPAL_THREAD_ADD32(&module->qps[qp_idx].u.srq_qp.sd_credits, 1);
        /* the SRQ pending list belongs to the module, so take the module lock */
        OPAL_THREAD_LOCK(&module->ib_lock);
        opal_list_append(&module->qps[qp_idx].u.srq_qp.pending_frags[list_idx],
                         (opal_list_item_t *)frag);
        OPAL_THREAD_UNLOCK(&module->ib_lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* per-peer queue pair: credits live in the endpoint */
    if (OPAL_THREAD_ADD32(&endpoint->qps[qp_idx].u.pp_qp.sd_credits, -1) >= 0) {
        return OMPI_SUCCESS;
    }

    OPAL_THREAD_ADD32(&endpoint->qps[qp_idx].u.pp_qp.sd_credits, 1);
    opal_list_append(&endpoint->qps[qp_idx].no_credits_pending_frags[list_idx],
                     (opal_list_item_t *)frag);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
 * Constructor for control send fragments.
 *
 * Control messages carry no coalesce header, so the fragment header is
 * moved to the very start of the buffer (chdr).  The segment address and
 * the scatter/gather entry are adjusted to match.
 */
static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag)
{
    frag->hdr = frag->chdr;

    /* the scatter/gather entry starts at the header ... */
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;
    /* ... and the payload starts right behind it */
    to_base_frag(frag)->segment.seg_addr.pval = frag->hdr + 1;

    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_CONTROL;
}
示例#4
0
/*
 * Reserve a send work-queue entry for the fragment.
 *
 * If qp_get_wqe() reports a negative value the reservation is undone with
 * qp_put_wqe(), the fragment is appended to the endpoint's
 * no_wqe_pending_frags list (list 0 for priority fragments, list 1
 * otherwise) and OPAL_ERR_OUT_OF_RESOURCE is returned.
 *
 * @param ep    endpoint owning the queue pair
 * @param frag  fragment to be sent
 * @return OPAL_SUCCESS or OPAL_ERR_OUT_OF_RESOURCE
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag)
{
    const int qp_idx = to_base_frag(frag)->base.order;
    const int list_idx =
        (to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ? 0 : 1;

    if (qp_get_wqe(ep, qp_idx) >= 0) {
        return OPAL_SUCCESS;
    }

    /* nothing available: undo the reservation and park the fragment */
    qp_put_wqe(ep, qp_idx);
    opal_list_append(&ep->qps[qp_idx].no_wqe_pending_frags[list_idx],
                     (opal_list_item_t *)frag);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
示例#5
0
/*
 * Post an RDMA get fragment on the queue pair it is ordered on.
 *
 * Two resources are needed before posting: a send work-queue entry and a
 * get token.  Whenever a later step fails, everything taken so far is
 * handed back before returning.
 *
 * @param btl   BTL module (not used by this function)
 * @param ep    endpoint to post on
 * @param frag  get fragment carrying the prepared send descriptor
 * @return OPAL_SUCCESS once the work request was posted,
 *         OPAL_ERR_OUT_OF_RESOURCE if no work-queue entry or get token was
 *         available, OPAL_ERROR if ibv_post_send() failed
 */
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    const int qp_idx = to_base_frag(frag)->base.order;
    struct ibv_send_wr *failed_wr;
    int status = OPAL_ERR_OUT_OF_RESOURCE;

    /* step 1: a send work-queue entry */
    if (qp_get_wqe(ep, qp_idx) < 0) {
        qp_put_wqe(ep, qp_idx);
        return status;
    }

    /* step 2: a get token */
    if (OPAL_THREAD_ADD32(&ep->get_tokens, -1) < 0) {
        goto undo;
    }

    qp_inflight_wqe_to_frag(ep, qp_idx, to_com_frag(frag));
    qp_reset_signal_count(ep, qp_idx);

    /* step 3: hand the work request to the HCA */
    if (0 == ibv_post_send(ep->qps[qp_idx].qp->lcl_qp, &frag->sr_desc, &failed_wr)) {
        return OPAL_SUCCESS;
    }
    status = OPAL_ERROR;

undo:
    /* give back the work-queue entry first, then the get token */
    qp_put_wqe(ep, qp_idx);
    OPAL_THREAD_ADD32(&ep->get_tokens, 1);
    return status;
}
示例#6
0
/*
 * Constructor for incoming fragments: the descriptor gets exactly one
 * destination segment (the fragment's own segment) and no source segment.
 */
static void in_constructor(mca_btl_wv_in_frag_t *frag)
{
    /* no source side */
    to_base_frag(frag)->base.des_src = NULL;
    to_base_frag(frag)->base.des_src_cnt = 0;

    /* one destination segment, embedded in the fragment itself */
    to_base_frag(frag)->base.des_dst = &to_base_frag(frag)->segment;
    to_base_frag(frag)->base.des_dst_cnt = 1;
}
示例#7
0
/*
 * Post a send fragment on an endpoint.
 *
 * This function is called with endpoint->endpoint_lock held.
 *
 * If the descriptor has no ordering yet (MCA_BTL_NO_ORDER) it is ordered on
 * the fragment's own queue pair.  A send work-queue entry is acquired
 * first, then send credits; if either acquire_wqe() or
 * mca_btl_openib_endpoint_credit_acquire() does not return OPAL_SUCCESS the
 * function returns OPAL_ERR_RESOURCE_BUSY.  If post_send() itself fails,
 * the credits and the work-queue entry are released again and OPAL_ERROR
 * is returned.
 *
 * @param endpoint  endpoint to send on
 * @param frag      send fragment to post
 * @return OPAL_SUCCESS, OPAL_ERR_RESOURCE_BUSY or OPAL_ERROR
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY;
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc, rc;
    bool do_rdma = false;
    size_t size;

    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
        return OPAL_ERR_RESOURCE_BUSY;

    /* payload of this fragment plus whatever has been coalesced onto it */
    size = des->des_segments->seg_len + frag->coalesced_length;

    rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
                                                 &do_rdma, frag, true);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        /* no credits: hand the work-queue entry back */
        qp_put_wqe(endpoint, qp);
        return OPAL_ERR_RESOURCE_BUSY;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if(!ib_rc)
        return OPAL_SUCCESS;

    /* The post failed: roll back.
     * NOTE(review): presumably post_send() converted the header to network
     * byte order for nbo endpoints and this converts it back - confirm. */
    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);

    qp_put_wqe(endpoint, qp);

    /* size is a size_t; cast it so that it matches the %lu conversion on
     * platforms where size_t is not unsigned long */
    BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
               ib_rc, strerror(ib_rc), (unsigned long) size));
    return OPAL_ERROR;
}
/*
 * Constructor for get fragments: marks the fragment as RECV_USER and
 * pre-fills its send work request as a signaled RDMA read with a single
 * scatter/gather entry.
 */
static void get_constructor(mca_btl_openib_get_frag_t *frag)
{
    /* one scatter/gather entry, the one embedded in the common fragment */
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.next = NULL;

    /* signaled RDMA read; the fragment address serves as work request id */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;

    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER;
}
示例#9
0
/*
 * Constructor for get fragments: marks the fragment as RECV_USER and
 * pre-fills its send request as a signaled RDMA read with a single
 * scatter/gather entry.
 */
static void get_constructor(mca_btl_wv_get_frag_t *frag)
{
    /* one scatter/gather entry, the one embedded in the common fragment */
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.pNext = NULL;

    /* signaled RDMA read; the fragment address serves as request id */
    frag->sr_desc.Opcode = WvRdmaRead;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;

    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_RECV_USER;
}
示例#10
0
/*
 * Constructor for coalesced fragments: tags the fragment as COALESCED and
 * gives its descriptor one source segment (the fragment's own segment) and
 * no destination segment.
 */
static void coalesced_constructor(mca_btl_wv_coalesced_frag_t *frag)
{
    /* no destination side */
    to_base_frag(frag)->base.des_dst = NULL;
    to_base_frag(frag)->base.des_dst_cnt = 0;

    /* one source segment, embedded in the fragment itself */
    to_base_frag(frag)->base.des_src = &to_base_frag(frag)->segment;
    to_base_frag(frag)->base.des_src_cnt = 1;

    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_COALESCED;
}
示例#11
0
/*
 * Constructor for common fragments: remembers the memory registration
 * attached to the underlying free-list item and, when there is one, copies
 * its local key into the scatter/gather entry and the segment key.
 */
static void com_constructor(mca_btl_wv_com_frag_t *frag)
{
    mca_btl_wv_reg_t *registration =
        (mca_btl_wv_reg_t*)to_base_frag(frag)->base.super.registration;

    frag->registration = registration;

    /* nothing more to set up without a registration */
    if (NULL == registration) {
        return;
    }

    frag->sg_entry.Lkey = registration->mr->lkey;
    to_base_frag(frag)->segment.seg_key.key32[0] = registration->mr->lkey;
}
示例#12
0
/*
 * Constructor for outgoing fragments: the descriptor gets one source
 * segment (the fragment's own segment) and no destination segment, and the
 * send work request is pre-filled as a signaled send with a single
 * scatter/gather entry.
 */
static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
    /* work request: signaled send, fragment address as work request id */
    frag->sr_desc.opcode = IBV_WR_SEND;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;

    /* one scatter/gather entry, the one embedded in the common fragment */
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.next = NULL;

    /* descriptor: source only */
    to_base_frag(frag)->base.des_dst = NULL;
    to_base_frag(frag)->base.des_dst_cnt = 0;
    to_base_frag(frag)->base.des_src = &to_base_frag(frag)->segment;
    to_base_frag(frag)->base.des_src_cnt = 1;
}
示例#13
0
/*
 * Send the eager RDMA connect message to the remote endpoint.
 *
 * A control fragment is allocated and filled with an eager RDMA header
 * holding the rkey and start address of the local eager RDMA region.  The
 * header is converted with the HTON macro when the endpoint has nbo set,
 * and the fragment is then passed to mca_btl_openib_endpoint_send().
 *
 * @param endpoint  endpoint to notify
 * @return -1 if no control fragment could be allocated, OPAL_SUCCESS if
 *         the send returned OPAL_SUCCESS or OPAL_ERR_RESOURCE_BUSY,
 *         otherwise the error code of the send (the fragment is returned
 *         to its free list in that case)
 */
static int mca_btl_openib_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_eager_rdma_header_t *conn_hdr;
    mca_btl_openib_send_control_frag_t *ctl_frag;
    int status;

    ctl_frag = alloc_control_frag(endpoint->endpoint_btl);
    if (NULL == ctl_frag) {
        return -1;
    }

    /* descriptor: priority control message on the credits queue pair whose
     * completion callback is always invoked */
    to_base_frag(ctl_frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(ctl_frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(ctl_frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(ctl_frag)->base.des_cbdata = NULL;
    to_base_frag(ctl_frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(ctl_frag)->endpoint = endpoint;

    /* payload: where the peer may RDMA its eager messages to */
    ctl_frag->hdr->tag = MCA_BTL_TAG_IB;
    conn_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(ctl_frag)->segment.seg_addr.pval;
    conn_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    conn_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    conn_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 conn_hdr->rkey,
                 conn_hdr->rdma_start.lval,
                 conn_hdr->rdma_start.pval,
                 conn_hdr->rdma_start.ival,
                 conn_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t)
                 ));

    if (endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*conn_hdr));

        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     conn_hdr->rkey,
                     conn_hdr->rdma_start.lval,
                     conn_hdr->rdma_start.pval,
                     conn_hdr->rdma_start.ival
                     ));
    }

    status = mca_btl_openib_endpoint_send(endpoint, ctl_frag);
    if (OPAL_SUCCESS != status && OPAL_ERR_RESOURCE_BUSY != status) {
        /* the send failed outright: give the fragment back and report */
        MCA_BTL_IB_FRAG_RETURN(ctl_frag);
        BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
        return status;
    }

    return OPAL_SUCCESS;
}
示例#14
0
/*
 * Constructor for receive fragments: tags the fragment as RECV, places the
 * header at the start of the free-list item's buffer with the payload
 * right behind it, and pre-fills the receive descriptor with a single
 * scatter/gather entry that starts at the header.
 */
static void recv_constructor(mca_btl_wv_recv_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_RECV;

    /* buffer layout: [header][payload] */
    frag->hdr = (mca_btl_wv_header_t*)to_base_frag(frag)->base.super.ptr;
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;
    to_base_frag(frag)->segment.seg_addr.pval =
        ((unsigned char* )frag->hdr) + sizeof(mca_btl_wv_header_t);

    /* receive descriptor: one entry, fragment address as request id */
    frag->rd_desc.sg_list = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->rd_desc.num_sge = 1;
    frag->rd_desc.next = NULL;
    frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag;
}
示例#15
0
/*
 * Constructor for outgoing fragments: the descriptor gets one source
 * segment (the fragment's own segment) and no destination segment, and the
 * send request is pre-filled as a signaled send with a single
 * scatter/gather entry.
 */
static void out_constructor(mca_btl_wv_out_frag_t *frag)
{
    /* send request: signaled send, fragment address as request id */
    frag->sr_desc.Opcode = WvSend;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;

    /* one scatter/gather entry, the one embedded in the common fragment */
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.pNext = NULL;

    /* descriptor: source only */
    to_base_frag(frag)->base.des_dst = NULL;
    to_base_frag(frag)->base.des_dst_cnt = 0;
    to_base_frag(frag)->base.des_src = &to_base_frag(frag)->segment;
    to_base_frag(frag)->base.des_src_cnt = 1;
}
示例#16
0
/*
 * Constructor for send fragments.
 *
 * chdr points at the very start of the free-list item's buffer; hdr is
 * placed behind room for a coalesced header and a control header, and the
 * payload follows hdr.  The scatter/gather entry starts at hdr.  The
 * fragment starts out with nothing coalesced onto it.
 */
static void send_constructor(mca_btl_wv_send_frag_t *frag)
{
    unsigned char *buffer = (unsigned char*)to_base_frag(frag)->base.super.ptr;

    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_SEND;

    /* buffer layout: [coalesced hdr][control hdr][hdr][payload] */
    frag->chdr = (mca_btl_wv_header_t*)buffer;
    frag->hdr = (mca_btl_wv_header_t*)
        (buffer +
         sizeof(mca_btl_wv_header_coalesced_t) +
         sizeof(mca_btl_wv_control_header_t));
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;
    to_base_frag(frag)->segment.seg_addr.pval = frag->hdr + 1;

    /* empty coalescing state */
    OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t);
    frag->coalesced_length = 0;
}
示例#17
0
/*
 * Try to send a fragment on the given endpoint.
 *
 * Under the endpoint lock, check_endpoint_state() is consulted first (it
 * is handed the endpoint's pending_lazy_frags list, so an endpoint that is
 * not connected yet can queue the fragment and start connecting as
 * required).  Only when it returns OPAL_SUCCESS is the fragment posted.
 *
 * OPAL_ERR_RESOURCE_BUSY is reported to the caller as OPAL_SUCCESS.
 *
 * @param ep    endpoint to send on
 * @param frag  fragment to send
 * @return OPAL_SUCCESS or the error from the state check / post
 */
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep,
                                 mca_btl_openib_send_frag_t* frag)
{
    int status;

    OPAL_THREAD_LOCK(&ep->endpoint_lock);

    status = check_endpoint_state(ep, &to_base_frag(frag)->base,
                                  &ep->pending_lazy_frags);
    if (OPAL_LIKELY(OPAL_SUCCESS == status)) {
        status = mca_btl_openib_endpoint_post_send(ep, frag);
    }

    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

    /* "busy" is not an error for the caller */
    return OPAL_UNLIKELY(OPAL_ERR_RESOURCE_BUSY == status) ? OPAL_SUCCESS : status;
}
示例#18
0
/*
 * Free-list item initializer for wv fragments.
 *
 * Receive and send fragments are bound to the queue pair index given in
 * the init data; receive fragments additionally get the length of their
 * scatter/gather entry set to the queue pair's buffer size plus the three
 * header sizes.  Every fragment records the list from the init data.
 *
 * @param item  free-list item being initialized (a wv fragment)
 * @param ctx   pointer to a mca_btl_wv_frag_init_data_t
 */
void mca_btl_wv_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_btl_wv_frag_init_data_t *data = (mca_btl_wv_frag_init_data_t *) ctx;
    mca_btl_wv_frag_t *base = to_base_frag(item);

    switch (base->type) {
    case MCA_BTL_WV_FRAG_RECV:
        to_recv_frag(base)->qp_idx = data->order;
        /* room for the payload and every header that may precede it */
        to_com_frag(base)->sg_entry.Length =
            mca_btl_wv_component.qp_infos[data->order].size +
            sizeof(mca_btl_wv_header_t) +
            sizeof(mca_btl_wv_header_coalesced_t) +
            sizeof(mca_btl_wv_control_header_t);
        break;

    case MCA_BTL_WV_FRAG_SEND:
        to_send_frag(base)->qp_idx = data->order;
        break;

    default:
        /* other fragment types carry no queue pair index */
        break;
    }

    base->list = data->list;
}
示例#19
0
/**
 * This function is called when we get an error on the completion
 * event of a fragment.  We check to see what type of fragment it is
 * and act accordingly.  In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL.  The first control message will tell the
 * remote side to also map out this connection.  The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state.  See that function for more details.
 *
 * Everything that is needed from the descriptor after its completion
 * callback has run (fragment type, eager RDMA marker) is read before the
 * callback is invoked, because the descriptor may be handed back through
 * mca_btl_openib_free() right after the callback.
 *
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          opal_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;
    /* Snapshot of the fragment type; des must not be inspected any more
     * once it has been released further down. */
    const int frag_type = openib_frag_type(des);
    long eager_marker;

    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag.  If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here.  If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error.  Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done.  The important information needs to be read
     * from the fragment. */

    /* Cannot issue callback to SRQ errors because the shared receive
     * queue is shared and is not specific to a connection.  There is no
     * way to figure out what type of message created the error because
     * we need the information in the wc->imm_data field which does not
     * exist when we have an error.  So, nothing to do here but return. */
    if ((MCA_BTL_OPENIB_FRAG_RECV == frag_type) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", frag_type);
        /* Need to think about returning any shared resources of the
         * SRQ.  For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug.
     * NOTE(review): openib_btl is dereferenced unconditionally below, so
     * this NULL check only protects the string creation. */
    if (NULL != openib_btl) {
        if (asprintf(&btlname, "lid=%d:name=%s",
                     openib_btl->lid, openib_btl->device->ib_dev->name) < 0) {
            /* the pointer is not usable after a failed asprintf */
            btlname = NULL;
        }
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor.  They are not associated with a PML
     * descriptor because:
     *    A. It was a receive
     *    B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((MCA_BTL_OPENIB_FRAG_RECV == frag_type) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
         (MCA_BTL_OPENIB_FRAG_CONTROL == frag_type) ||
         (MCA_BTL_OPENIB_FRAG_EAGER_RDMA == frag_type)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                              remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer.  The change in the PML
         * layer should prevent that we ever try to send on this BTL
         * again.  If we do, then this is an error case.  */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIG_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            frag_type, (long unsigned int) des);
        free(btlname);
        return;
    }

    /* These are RDMA read type fragments.  Just continue with processing */
    if (MCA_BTL_OPENIB_FRAG_RECV_USER == frag_type) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write.  Call the PML callback function to map out this
     * btl for further sending.  We just call this every time we get an
     * error even though it is not necessary.  Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    free(btlname);

    /* Since we believe we have done a send, read or write, then the
     * des_local fields should have valid data. */
    assert(des->des_local != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
     * change the status.  Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it.  In
     * addition, send a message to other end of the connection letting
     * it know that this side is now broken.  This is needed in the case
     * of a spurious error which may not cause the remote side to detect
     * the error.  */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    /* Now, call the callback function associated with the fragment.
     * In case the fragments were coalesced we need to pull them apart
     * and call the callback function for each one. */
    if (MCA_BTL_OPENIB_FRAG_SEND == frag_type) {
        opal_list_item_t *i;
        while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if( btl_ownership ) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }

    /* Read the eager RDMA marker before the descriptor can be released.
     * A non-zero value for the ftr variable indicates that this was an
     * eager RDMA fragment.
     * NOTE(review): as before, ftr is read through to_send_frag() for
     * every fragment type that reaches this point - confirm that this is
     * meaningful for SEND_USER / RECV_USER fragments. */
    eager_marker = (long) to_send_frag(des)->ftr;

    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
    if( btl_ownership ) {
        mca_btl_openib_free(&openib_btl->super, des);
    }
    /* des must not be touched from here on. */

    /* Here we send another control message to notify the remote side
     * we had an error on a eager fragment.  We need to do this in case
     * the eager RDMA fragment after this one actually made it
     * successfully. */
    if (0 != eager_marker) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       eager_marker - 1);
    }

    /* We know we have completed a send so return some resources even
     * though connection is broken.  With SRQ, the resources are shared
     * so if we do not return the credits we may not be allowed to send
     * anymore. */
    qp_put_wqe(endpoint, qp);
    if ((MCA_BTL_OPENIB_FRAG_SEND == frag_type) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them.  Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path.  This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments so
     * subsequent calls are no-ops.  The endpoint has already been
     * dereferenced above, so no NULL check is needed here. */
    error_out_all_pending_frags(endpoint, &openib_btl->super, true);
}
示例#20
0
/**
 * This function will find all the pending fragments on an endpoint
 * and call the callback function with OPAL_ERROR.  It walks through
 * each qp with each priority and looks for both no_credits_pending_frags
 * and no_wqe_pending_frags.  It then looks for any pending_lazy_frags,
 * pending_put_frags, and pending_get_frags.  This function is only
 * called when running with failover support enabled.  Note that
 * the errout parameter allows the function to also be used as a
 * debugging tool to see if there are any fragments on any of the
 * queues.
 * @param ep Pointer to endpoint that had error
 * @param module Pointer to module that had error
 * @param errout Boolean which says whether to error them out or not
 */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout)
{
    int qp, pri, len, total, btl_ownership;

    opal_list_item_t *item;
    mca_btl_openib_com_frag_t* frag;
    mca_btl_base_descriptor_t *des;
    int verbose = 10;  /* Verbosity level unless debugging */

    /* If debugging, drop verbosity level so we can see the output
     * regardless of the level the program was run with. */
    if (false == errout) {
	verbose = 0;
    }

    total = 0;
    /* Traverse all QPs and all priorities and move to other endpoint */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* All types of qp's have a no_wqe_pending_frags list */
            len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
            if (len > 0) {
                total += len;
                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                    "IB: Checking for no_wqe_pending_frags qp=%d, "
                                    "pri=%d, list size=%d",
                                    qp, pri, len);
                if (true == errout) {
                    while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                  no_wqe_pending_frags[pri]))) {
                        frag = (mca_btl_openib_com_frag_t *) item;
                        des = (mca_btl_base_descriptor_t *)frag;

                        /* Error out any coalesced frags if they exist */
                        if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                            opal_list_item_t *i;
                            while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                    "IB: Found coalesced frag in no_wqe_pending_frags");
                                btl_ownership = (to_base_frag(i)->base.des_flags &
                                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                 &to_base_frag(i)->base, OPAL_ERROR);
                                if( btl_ownership ) {
                                    mca_btl_openib_free(module, &to_base_frag(i)->base);
                                }
                            }
                        }
                        btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                        des->des_cbfunc(module, ep, des, OPAL_ERROR);
                        if( btl_ownership ) {
                            mca_btl_openib_free(module, des);
                        }
                    }
                }
            }
            if (BTL_OPENIB_QP_TYPE_PP(qp)) {
                len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for no_credits_pending_frags qp=%d, "
                                        "pri=%d, list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                      no_credits_pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in "
                                                        "no_credits_pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }

            } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
                len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for srq pending_frags qp=%d, pri=%d, "
                                        "list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
                                                                      u.srq_qp.pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in SRQ pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            }
        }
    }

    /* Check for any frags from a connection that was never made.  Not sure if this
     * can actually happen. */
    len = opal_list_get_size(&ep->pending_lazy_frags);

    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_lazy_frags, list size=%d", len);
        if (true == errout) {
            while  (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_put_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_put_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_get_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_get_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
                        "IB: Finished checking for pending_frags, total moved=%d",
                        total);
}
示例#21
0
/**
 * Send a credit-update control message for one queue pair to the peer.
 *
 * The control fragment is allocated on first use and cached in
 * endpoint->qps[qp].credit_frag.  The message carries the accumulated
 * receive credits of the QP (rd_credits), the number of returned credit
 * messages (cm_return, reported through hdr->cm_seen and capped at 255)
 * and the eager RDMA credits (eager_rdma_local.credits).
 *
 * Every path that gives up without a successful post releases the
 * credit-send lock with BTL_OPENIB_CREDITS_SEND_UNLOCK; the successful
 * path returns without releasing it.
 * NOTE(review): presumably the caller took that lock before calling and
 * the send completion callback releases it -- confirm against the caller.
 *
 * @param endpoint  endpoint whose credits are returned to the peer
 * @param qp        index of the queue pair the credits belong to
 */
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
        const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        if(OPAL_UNLIKELY(NULL == frag)) {
            /* No control fragment available: nothing has been consumed
             * yet, so just drop the lock like the other bail-out paths
             * and leave the credits in place for a later attempt. */
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_IB;
        to_base_frag(frag)->segment.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;

    /* Prefer the eager RDMA channel; otherwise the message consumes one of
     * the rd_rsv slots of this QP, tracked by cm_sent. */
    if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            /* all reserved slots are in use: undo the increment and give up */
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
     }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        /* cm_seen is capped at 255; hand the remainder back to the counter */
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
         BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    /* Posting failed: convert the headers back to host order so the credit
     * values can be read, then return everything that was taken above. */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
            frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    /* NOTE(review): the message prints rc as the errno but formats the
     * global errno -- confirm which of the two post_send() reports. */
    BTL_ERROR(("error posting send request errno %d says %s", rc,
                strerror(errno)));
}
示例#22
0
/*
 * Set up the local eager RDMA buffers of an endpoint and notify the remote
 * endpoint.
 *
 * Steps performed below:
 *   1. claim the endpoint by swapping eager_rdma_local.base.pval from NULL
 *      to the placeholder value 1 (returns immediately if the swap fails),
 *   2. allocate an array of eager_rdma_num receive fragment headers and a
 *      buffer of eager_rdma_num * eager_rdma_frag_size bytes from the BTL's
 *      mpool,
 *   3. construct one receive fragment per slot,
 *   4. publish the buffer address in eager_rdma_local.base.pval and call
 *      mca_btl_openib_endpoint_send_eager_rdma(),
 *   5. on success retain the endpoint and add it to the device's
 *      eager_rdma_buffers array; on any failure release what was allocated
 *      and reset base.pval and frags to NULL.
 *
 * @param endpoint  endpoint to set up eager RDMA for
 */
void mca_btl_openib_endpoint_connect_eager_rdma(
        mca_btl_openib_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_openib_recv_frag_t *headers_buf;
    int i;
    uint32_t flag = MCA_MPOOL_FLAGS_CACHE_BYPASS;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    /* one receive fragment header per eager RDMA slot */
    headers_buf = (mca_btl_openib_recv_frag_t*)
        malloc(sizeof(mca_btl_openib_recv_frag_t) *
            mca_btl_openib_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

#if HAVE_DECL_IBV_ACCESS_SO
    /* Solaris implements the Relaxed Ordering feature defined in the
       PCI Specification. With this in mind any memory region which
       relies on a buffer being written in a specific order, for
       example the eager rdma connections created in this routine,
       must set a strong order flag when registering the memory for
       rdma operations.

       The following flag will be interpreted and the appropriate
       steps will be taken when the memory is registered in
       openib_reg_mr(). */
    flag |= MCA_MPOOL_FLAGS_SO_MEM;
#endif

    /* buffer holding all slots; its registration is stored in
     * endpoint->eager_rdma_local.reg */
    buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
            openib_btl->eager_rdma_frag_size *
            mca_btl_openib_component.eager_rdma_num,
            mca_btl_openib_component.buffer_alignment,
            flag,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
       goto free_headers_buf;

    /* Shift the base forward so that a header, btl_eager_limit bytes and a
     * footer starting at "buf + i * eager_rdma_frag_size" end exactly at
     * the end of slot i.  From here on buf is no longer the address that
     * mpool_alloc() returned. */
    buf = buf + openib_btl->eager_rdma_frag_size -
        sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
        sizeof(mca_btl_openib_header_t);

    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        opal_free_list_item_t *item;
        mca_btl_openib_recv_frag_t * frag;
        mca_btl_openib_frag_init_data_t init_data;

        /* the fragment lives in headers_buf; its data pointer refers to
         * slot i of the registered buffer */
        item = (opal_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);

        init_data.order = mca_btl_openib_component.credits_qp;
        init_data.list = NULL;

        mca_btl_openib_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        /* footer sits eager_limit bytes after the start of the segment.
         * NOTE(review): this uses mca_btl_openib_component.eager_limit
         * while the offset above uses openib_btl->super.btl_eager_limit --
         * confirm the two always agree. */
        frag->ftr = (mca_btl_openib_footer_t*)
            ((char*)to_base_frag(frag)->segment.seg_addr.pval +
             mca_btl_openib_component.eager_limit);

        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* rd_win is a quarter of the number of slots, but at least 1 */
    endpoint->eager_rdma_local.rd_win =
        mca_btl_openib_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 (void*)1, buf);

    if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
        mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_openib_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* claim the array entry at index eager_rdma_buffers_count; the
         * index is re-read on every attempt until the NULL -> endpoint
         * swap succeeds */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&openib_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* Notification failed: release everything allocated above.
     * NOTE(review): buf is the shifted pointer, not the address returned by
     * mpool_alloc(); confirm mpool_free() locates the allocation through
     * the registration.  The fragments constructed in the loop are not
     * destructed before headers_buf is freed -- confirm that is intended. */
    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
           buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
示例#23
0
/*
 * Initialize a put fragment: its send work request is preset to an RDMA
 * write and the fragment is tagged as a user send fragment.
 */
static void put_constructor(mca_btl_openib_put_frag_t *frag)
{
    /* opcode of the work request embedded in the fragment */
    to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;

    /* fragment type recorded in the base fragment */
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
}
示例#24
0
/*
 * Initialize a put fragment: its send work request is preset to an RDMA
 * write and the fragment is tagged as a user send fragment.
 */
static void put_constructor(mca_btl_wv_put_frag_t *frag)
{
    /* opcode of the work request embedded in the fragment */
    to_out_frag(frag)->sr_desc.Opcode = WvRdmaWrite;

    /* fragment type recorded in the base fragment */
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_SEND_USER;
}
示例#25
0
/**
 * Tell the remote side that an endpoint is broken so that it can bring its
 * own endpoint down as well.  This is needed because sometimes only one
 * side of a connection notices that there was a problem.
 *
 * The message is sent over a different openib BTL than the one the failing
 * endpoint belongs to, using that BTL's endpoint for the same remote
 * process.  If no other BTL, no matching endpoint or no control fragment
 * is available, the function returns without sending anything.
 *
 * @param endpoint Pointer to endpoint with error
 * @param type     Type of message to be sent, can be one of two types
 * @param index    When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t *failed_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t *alt_btl = NULL;
    mca_btl_base_endpoint_t *alt_ep = NULL;
    opal_proc_t *peer = endpoint->endpoint_proc->proc_opal;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t *ctl_frag;
    int idx, rc;

    /* Pick the first BTL that is not the one that reported the error. */
    for (idx = 0; idx < mca_btl_openib_component.ib_num_btls; idx++) {
        if (failed_btl != mca_btl_openib_component.openib_btls[idx]) {
            alt_btl = mca_btl_openib_component.openib_btls[idx];
            break;
        }
    }
    if (NULL == alt_btl) {
        /* No alternative BTL: nothing to send the message over. */
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        return;
    }

    /* The device behind the chosen BTL keeps the list of all its
     * endpoints; look for the one that talks to the same remote process. */
    for (idx = 0; idx < opal_pointer_array_get_size(alt_btl->device->endpoints); idx++) {
        mca_btl_base_endpoint_t *candidate = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(alt_btl->device->endpoints, idx);

        if (NULL != candidate && candidate->endpoint_proc->proc_opal == peer) {
            alt_ep = candidate;
            break;
        }
    }
    if (NULL == alt_ep) {
        /* No endpoint to that process on the alternative BTL. */
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        return;
    }

    ctl_frag = alloc_control_frag(alt_btl);
    if (NULL == ctl_frag) {
        /* No fragment available for the control message. */
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        return;
    }

    /* Descriptor setup: priority control message on the credits QP whose
     * completion callback is always invoked. */
    to_base_frag(ctl_frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_notify_cb;
    to_base_frag(ctl_frag)->base.des_cbdata = NULL;
    to_base_frag(ctl_frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(ctl_frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(ctl_frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(ctl_frag)->endpoint = alt_ep;

    ctl_frag->hdr->tag = MCA_BTL_TAG_IB;

    /* Payload: identify the broken port (lid / subnet id) and this process. */
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(ctl_frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if (alt_ep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }

    rc = mca_btl_openib_endpoint_send(alt_ep, ctl_frag);
    if (OPAL_SUCCESS != rc && OPAL_ERR_RESOURCE_BUSY != rc) {
        /* The send was neither accepted nor deferred: give the fragment
         * back and report the failure. */
        MCA_BTL_IB_FRAG_RETURN(ctl_frag);
        BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
    }
}
示例#26
0
/*
 * Set up the local eager RDMA buffers of an endpoint and notify the remote
 * endpoint.
 *
 * Steps performed below:
 *   1. claim the endpoint by swapping eager_rdma_local.base.pval from NULL
 *      to the placeholder value 1 (returns immediately if the swap fails),
 *   2. allocate an array of eager_rdma_num receive fragment headers and a
 *      buffer of eager_rdma_num * eager_rdma_frag_size bytes from the BTL's
 *      mpool,
 *   3. construct one receive fragment per slot,
 *   4. publish the buffer address in eager_rdma_local.base.pval and call
 *      mca_btl_wv_endpoint_send_eager_rdma(),
 *   5. on success retain the endpoint and add it to the device's
 *      eager_rdma_buffers array; on any failure release what was allocated
 *      and reset base.pval and frags to NULL.
 *
 * @param endpoint  endpoint to set up eager RDMA for
 */
void mca_btl_wv_endpoint_connect_eager_rdma(
        mca_btl_wv_endpoint_t* endpoint)
{
    mca_btl_wv_module_t* wv_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_wv_recv_frag_t *headers_buf;
    int i;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    /* one receive fragment header per eager RDMA slot */
    headers_buf = (mca_btl_wv_recv_frag_t*)
        malloc(sizeof(mca_btl_wv_recv_frag_t) *
            mca_btl_wv_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

    /* buffer holding all slots; its registration is stored in
     * endpoint->eager_rdma_local.reg */
    buf = (char *) wv_btl->super.btl_mpool->mpool_alloc(wv_btl->super.btl_mpool,
            wv_btl->eager_rdma_frag_size *
            mca_btl_wv_component.eager_rdma_num,
            mca_btl_wv_component.buffer_alignment,
            MCA_MPOOL_FLAGS_CACHE_BYPASS,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
       goto free_headers_buf;

    /* Shift the base forward so that a header, btl_eager_limit bytes and a
     * footer starting at "buf + i * eager_rdma_frag_size" end exactly at
     * the end of slot i.  From here on buf is no longer the address that
     * mpool_alloc() returned. */
    buf = buf + wv_btl->eager_rdma_frag_size -
        sizeof(mca_btl_wv_footer_t) - wv_btl->super.btl_eager_limit -
        sizeof(mca_btl_wv_header_t);

    for(i = 0; i < mca_btl_wv_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item;
        mca_btl_wv_recv_frag_t * frag;
        mca_btl_wv_frag_init_data_t init_data;

        /* the fragment lives in headers_buf; its data pointer refers to
         * slot i of the registered buffer */
        item = (ompi_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * wv_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_wv_recv_frag_t);

        init_data.order = mca_btl_wv_component.credits_qp;
        init_data.list = NULL;

        mca_btl_wv_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_WV_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        /* footer sits eager_limit bytes after the start of the segment.
         * NOTE(review): this uses mca_btl_wv_component.eager_limit while
         * the offset above uses wv_btl->super.btl_eager_limit -- confirm
         * the two always agree. */
        frag->ftr = (mca_btl_wv_footer_t*)
            ((char*)to_base_frag(frag)->segment.base.seg_addr.pval +
             mca_btl_wv_component.eager_limit);

        MCA_BTL_WV_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* rd_win is a quarter of the number of slots, but at least 1 */
    endpoint->eager_rdma_local.rd_win =
        mca_btl_wv_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
            buf);

    if(mca_btl_wv_endpoint_send_eager_rdma(endpoint) == OMPI_SUCCESS) {
        mca_btl_wv_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_wv_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* claim the array entry at index eager_rdma_buffers_count; the
         * index is re-read on every attempt until the NULL -> endpoint
         * swap succeeds */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&wv_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* Notification failed: release everything allocated above.
     * NOTE(review): buf is the shifted pointer, not the address returned by
     * mpool_alloc(); confirm mpool_free() locates the allocation through
     * the registration.  The fragments constructed in the loop are not
     * destructed before headers_buf is freed -- confirm that is intended. */
    wv_btl->super.btl_mpool->mpool_free(wv_btl->super.btl_mpool,
           buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
            endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
示例#27
0
/*
 * Post a send fragment on an endpoint.
 *
 * This function is called with endpoint->endpoint_lock held.
 *
 * A work queue entry is acquired first, then either an eager RDMA credit
 * (small, priority fragments only) or a regular send credit.  Pending
 * credits are piggybacked on the fragment header before it is posted.  If
 * the post fails, everything that was taken is given back.
 *
 * Returns OMPI_SUCCESS when the fragment was posted,
 * OMPI_ERR_RESOURCE_BUSY when no work queue entry or send credit could be
 * acquired, and OMPI_ERROR when post_send() failed.
 */
int mca_btl_wv_endpoint_post_send(mca_btl_wv_endpoint_t *endpoint,
        mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    bool do_rdma = false;
    int32_t cm_return;
    size_t rdma_size_cap;
    int qp, post_rc;

    /* a fragment without an ordering requirement goes out on its own QP */
    if (OPAL_LIKELY(MCA_BTL_NO_ORDER == des->order)) {
        des->order = frag->qp_idx;
    }
    qp = des->order;

    if (OMPI_SUCCESS != acruire_wqe(endpoint, frag)) {
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* High priority fragments that fit are tried over eager RDMA first. */
    rdma_size_cap = mca_btl_wv_component.eager_limit +
        sizeof(mca_btl_wv_header_coalesced_t) +
        sizeof(mca_btl_wv_control_header_t);
    if (des->des_src->seg_len + frag->coalesced_length <= rdma_size_cap &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        if (OMPI_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
            do_rdma = true;
        }
    }

    /* Otherwise a regular send credit is needed; without one the work
     * queue entry is handed back. */
    if (!do_rdma && OMPI_SUCCESS != acquire_send_credit(endpoint, frag)) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* Piggyback eager RDMA credits, flagged so they can be told apart. */
    BTL_WV_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if (hdr->credits) {
        hdr->credits |= BTL_WV_RDMA_CREDITS_FLAG;
    }

    if (do_rdma) {
        /* the QP index is stored in the credits field, shifted left by 11 */
        hdr->credits |= (qp << 11);
    } else if (BTL_WV_QP_TYPE_PP(qp) && 0 == hdr->credits) {
        /* no RDMA credits to return: send the receive credits of this
         * per-peer QP instead */
        BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
    }

    BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* hdr->cm_seen is capped at 255 (presumably an 8-bit field) while
     * cm_return is a 32-bit counter; the remainder goes back to the counter */
    if (cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    post_rc = post_send(endpoint, frag, do_rdma);
    if (0 == post_rc) {
        return OMPI_SUCCESS;
    }

    /* Posting failed: bring the header back to host order so the credit
     * values can be read, then undo all the accounting done above. */
    if (endpoint->nbo) {
        BTL_WV_HEADER_NTOH(*hdr);
    }

    if (BTL_WV_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_WV_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if (do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else if (BTL_WV_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                hdr->credits);
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
    } else if (BTL_WV_QP_TYPE_SRQ(qp)) {
        mca_btl_wv_module_t *wv_btl = endpoint->endpoint_btl;
        OPAL_THREAD_ADD32(&wv_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    BTL_ERROR(("error posting send request error %d: %s\n",
               post_rc, strerror(post_rc)));
    return OMPI_ERROR;
}
示例#28
0
/**
 * Start an RDMA read from remote memory into a local buffer.
 *
 * @param btl            BTL module the operation is issued on
 * @param ep             endpoint of the remote peer
 * @param local_address  local destination buffer
 * @param remote_address remote source address
 * @param local_handle   registration handle of the local buffer (lkey)
 * @param remote_handle  registration handle of the remote buffer (rkey)
 * @param size           number of bytes to read
 * @param flags          not used by this function
 * @param order          QP to use, or MCA_BTL_NO_ORDER for the RDMA QP
 * @param cbfunc         completion callback
 * @param cbcontext      context passed to the callback
 * @param cbdata         data passed to the callback
 *
 * @return OPAL_SUCCESS when the read was started or queued on
 *         ep->pending_get_frags, OPAL_ERR_BAD_PARAM when size exceeds
 *         btl->btl_get_limit, OPAL_ERR_OUT_OF_RESOURCE when no fragment
 *         could be allocated, otherwise the error reported by
 *         check_endpoint_state() or mca_btl_openib_get_internal().
 */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t *frag;
    int qp = order;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* no ordering requested: fall back to the component's RDMA QP */
    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* completion callback and the handle it needs */
    frag->cb.local_handle = local_handle;
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;

    /* base descriptor: target QP; the BTL owns the descriptor, so it is
     * freed when the operation is complete */
    to_base_frag(frag)->base.order = qp;
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* scatter-gather entry describing the local buffer */
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->endpoint = ep;

    /* work request: the opcode is set explicitly because an atomic
     * operation may have changed it on a recycled fragment */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.wr.rdma.remote_addr = remote_address;
    frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* peer with a different endianness: the rkey must be byte-swapped */
    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
            != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    }
#endif

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (MCA_BTL_IB_CONNECTED != ep->endpoint_state) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

        /* RESOURCE_BUSY is reported to the caller as success */
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            return OPAL_SUCCESS;
        }
        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
        return OPAL_SUCCESS;
    }

    if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
        /* out of resources right now: queue the fragment and report success */
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        return OPAL_SUCCESS;
    }

    MCA_BTL_IB_FRAG_RETURN (frag);
    return rc;
}
示例#29
0
/* Initialize a send control fragment by tagging its base fragment with the
 * control fragment type. */
static void send_control_constructor(mca_btl_wv_send_control_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_CONTROL;
}