/*
 * Build the local half of every queue pair on this endpoint; the
 * remote halves get connected at a later stage.
 *
 * Returns the first non-success code from qp_create_one(), otherwise
 * whatever mca_btl_openib_endpoint_post_recvs() returns.
 */
static int qp_create_all(mca_btl_base_endpoint_t* endpoint)
{
    int32_t reserved_sum = 0;
    int pp_count = 0;
    int idx, status;

    /* First pass: tally the per-peer QPs and the receive slots they
     * reserve. */
    for (idx = 0; idx < mca_btl_openib_component.num_qps; ++idx) {
        if (!BTL_OPENIB_QP_TYPE_PP(idx)) {
            continue;
        }
        reserved_sum += mca_btl_openib_component.qp_infos[idx].u.pp_qp.rd_rsv;
        pp_count++;
    }

    /* Eager RDMA flow control still needs one reserved WQE even when
     * no per-peer QP is configured. */
    if (0 == pp_count && true == endpoint->use_eager_rdma) {
        pp_count = 1;
    }

    /* Second pass: size and create each QP. */
    for (idx = 0; idx < mca_btl_openib_component.num_qps; ++idx) {
        struct ibv_srq *srq = NULL;
        uint32_t recv_depth, send_depth;
        int32_t extra_recv = 0, extra_send = 0;

        /* Only the QP that carries software flow-control traffic gets
         * the additional resources. */
        if (idx == mca_btl_openib_component.credits_qp) {
            extra_recv = reserved_sum;
            extra_send = pp_count;
        }

        if (!BTL_OPENIB_QP_TYPE_PP(idx)) {
            srq = endpoint->endpoint_btl->qps[idx].u.srq_qp.srq;
            /* receives go to the SRQ, never to the QP itself */
            recv_depth = 0;
            send_depth = mca_btl_openib_component.qp_infos[idx].u.srq_qp.sd_max
                + extra_send;
        } else {
            recv_depth = mca_btl_openib_component.qp_infos[idx].rd_num + extra_recv;
            send_depth = mca_btl_openib_component.qp_infos[idx].rd_num +
                extra_send;
        }

        status = qp_create_one(endpoint, idx, srq, recv_depth, send_depth);
        if (OMPI_SUCCESS != status) {
            return status;
        }
    }

    /* Every local QP exists now: post receive buffers, set up
       credits, etc. */
    return mca_btl_openib_endpoint_post_recvs(endpoint);
}
/*
 * Try to take one send credit for the QP the fragment is ordered on.
 *
 * On success returns OMPI_SUCCESS.  When no credit is available the
 * decrement is undone, the fragment is appended to the matching
 * pending list (index 0 for priority fragments, 1 otherwise) and
 * OMPI_ERR_OUT_OF_RESOURCE is returned.
 */
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    int qp = to_base_frag(frag)->base.order;
    int list_idx =
        (to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY) ? 0 : 1;

    if (BTL_OPENIB_QP_TYPE_PP(qp)) {
        /* Per-peer QP: credits are tracked on the endpoint. */
        if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) >= 0) {
            return OMPI_SUCCESS;
        }

        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[list_idx],
                (opal_list_item_t *)frag);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Otherwise the credits are tracked on the BTL module, and its
     * pending list is modified under ib_lock. */
    if (OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) >= 0) {
        return OMPI_SUCCESS;
    }

    OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    OPAL_THREAD_LOCK(&openib_btl->ib_lock);
    opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[list_idx],
                     (opal_list_item_t *)frag);
    OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
/* SRQ resizing in a nutshell -
   The SRQ is created with size = rd_num, but to use resources
   efficiently only rd_curr_num < rd_num WQEs are posted.  That number is
   grown on demand by this IBV_EVENT_SRQ_LIMIT_REACHED handler; the device
   raises the event when the number of WQEs in the SRQ drops below
   srq_limit.

   Returns OPAL_SUCCESS, or OPAL_ERROR when the SRQ is registered in the
   address table but matches no SRQ QP of the owning module. */
static int btl_openib_async_srq_limit_event(struct ibv_srq* srq)
{
    mca_btl_openib_module_t *openib_btl = NULL;
    opal_hash_table_t *srq_addr_table = &mca_btl_openib_component.srq_manager.srq_addr_table;
    opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock;
    int status = OPAL_SUCCESS;
    int idx;

    opal_mutex_lock(lock);

    /* No table entry for this key => the SRQ is assumed to have been
       destroyed already, so the event is not served. */
    if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(srq_addr_table,
                            &srq, sizeof(struct ibv_srq*), (void*) &openib_btl)) {
        goto unlock_and_return;
    }

    /* Locate the SRQ-type QP that owns this SRQ. */
    for (idx = 0; idx < mca_btl_openib_component.num_qps; idx++) {
        if (BTL_OPENIB_QP_TYPE_PP(idx)) {
            continue;
        }
        if (srq == openib_btl->qps[idx].u.srq_qp.srq) {
            break;
        }
    }

    if (idx >= mca_btl_openib_component.num_qps) {
        BTL_ERROR(("Open MPI tried to access a shared receive queue (SRQ) on the device %s that was not found.  This should not happen, and is a fatal error.  Your MPI job will now abort.\n", ibv_get_device_name(openib_btl->device->ib_dev)));
        status = OPAL_ERROR;
        goto unlock_and_return;
    }

    /* Double the number of posted WQEs, capped at the configured size. */
    openib_btl->qps[idx].u.srq_qp.rd_curr_num <<= 1;

    if (openib_btl->qps[idx].u.srq_qp.rd_curr_num <
                         mca_btl_openib_component.qp_infos[idx].rd_num) {
        /* Still room to grow: scale the low watermark too and keep the
           limit event flagged. */
        openib_btl->qps[idx].u.srq_qp.rd_low_local <<= 1;
        openib_btl->qps[idx].u.srq_qp.srq_limit_event_flag = true;
    } else {
        /* Reached full size: pin to the configured values and clear
           the limit-event flag. */
        openib_btl->qps[idx].u.srq_qp.rd_curr_num = mca_btl_openib_component.qp_infos[idx].rd_num;
        openib_btl->qps[idx].u.srq_qp.rd_low_local = mca_btl_openib_component.qp_infos[idx].rd_low;
        openib_btl->qps[idx].u.srq_qp.srq_limit_event_flag = false;
    }

unlock_and_return:
    opal_mutex_unlock(lock);
    return status;
}
Example #4
0
/*
 * Invoked once the connect module has created every qp on an
 * endpoint, so that receive buffers can be posted on each of them.
 * Always returns OPAL_SUCCESS.
 */
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
{
    int idx = 0;

    while (idx < mca_btl_openib_component.num_qps) {
        if (!BTL_OPENIB_QP_TYPE_PP(idx)) {
            /* non per-peer qp: post on the module */
            mca_btl_openib_post_srr(endpoint->endpoint_btl, idx);
        } else {
            /* per-peer qp: post on the endpoint itself */
            mca_btl_openib_endpoint_post_rr_nolock(endpoint, idx);
        }
        ++idx;
    }

    return OPAL_SUCCESS;
}
/*
 * Find all the CPCs that are eligible for a single local port (i.e.,
 * openib module).
 *
 * Every entry of the NULL-terminated "available" array is queried.
 * Modules that accept the port are collected into a freshly allocated
 * array which, on success, is stored in btl->cpcs (with the count in
 * btl->num_cpcs).
 *
 * Returns OMPI_SUCCESS when at least one CPC is eligible,
 * OMPI_ERR_NOT_SUPPORTED when none is (after showing a help message
 * listing the CPC names that were tried), OMPI_ERR_OUT_OF_RESOURCE on
 * allocation failure, or the error returned by a CPC query.  On every
 * failure path all memory allocated here is released.
 */
int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *btl)
{
    char *msg = NULL;
    int i, rc, cpc_index, len;
    ompi_btl_openib_connect_base_module_t **cpcs;

    cpcs = calloc(num_available,
                  sizeof(ompi_btl_openib_connect_base_module_t *));
    if (NULL == cpcs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Size the buffer for the comma-separated list of CPC names: each
       name plus room for a ", " separator, plus the terminating NUL. */
    for (len = 1, i = 0; NULL != available[i]; ++i) {
        len += strlen(available[i]->cbc_name) + 2;
    }
    msg = malloc(len);
    if (NULL == msg) {
        /* Do not leak the module array allocated above. */
        free(cpcs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    msg[0] = '\0';

    /* Go through all available CPCs and query them to see if they
       want to run on this module.  If they do, save them to a running
       array. */
    for (cpc_index = i = 0; NULL != available[i]; ++i) {
        /* Record every name that is tried, eligible or not, so the
           help message below can list them all. */
        if (i > 0) {
            strcat(msg, ", ");
        }
        strcat(msg, available[i]->cbc_name);

        rc = available[i]->cbc_query(btl, &cpcs[cpc_index]);
        if (OMPI_ERR_NOT_SUPPORTED == rc || OMPI_ERR_UNREACH == rc) {
            /* This CPC declined the port; try the next one. */
            continue;
        } else if (OMPI_SUCCESS != rc) {
            free(cpcs);
            free(msg);
            return rc;
        }
        opal_output(-1, "match cpc for local port: %s",
                    available[i]->cbc_name);

        /* If the CPC wants to use the CTS protocol, check to ensure
           that QP 0 is PP; if it's not, we can't use this CPC (or the
           CTS protocol).  cpc_index is not advanced, so the slot is
           reused by the next query. */
        if (cpcs[cpc_index]->cbm_uses_cts &&
            !BTL_OPENIB_QP_TYPE_PP(0)) {
            BTL_VERBOSE(("this CPC only supports when the first btl_openib_receive_queues QP is a PP QP"));
            continue;
        }

        /* This CPC has indicated that it wants to run on this openib
           BTL module.  Woo hoo! */
        ++cpc_index;
    }

    /* If we got an empty array, then no CPCs were eligible.  Doh! */
    if (0 == cpc_index) {
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "no cpcs for port", true,
                       orte_process_info.nodename,
                       ibv_get_device_name(btl->device->ib_dev),
                       btl->port_num, msg);
        free(cpcs);
        free(msg);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    free(msg);

    /* We got at least one eligible CPC; save the array into the
       module's port_info */
    btl->cpcs = cpcs;
    btl->num_cpcs = cpc_index;

    return OMPI_SUCCESS;
}
Example #6
0
/**
 * This function is called when we get an error on the completion
 * event of a fragment.  We check to see what type of fragment it is
 * and act accordingly.  In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL.  The first control message will tell the
 * remote side to also map out this connection.  The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state.  See that function for more details.
 *
 * Everything that is still needed from the descriptor after its
 * completion callback (fragment type, eager RDMA marker) is read
 * before the callback runs, because the callback or the ownership
 * free may release the descriptor.
 *
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          opal_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;
    int frag_type;     /* fragment type saved before des may be released */
    long eager_ftr;    /* ftr marker saved before des may be released */
    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag.  If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here.  If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error.  Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done.  The important information needs to be read
     * from the fragment. */

    /* Cannot issue callback to SRQ errors because the shared receive
     * queue is shared and is not specific to a connection.  There is no
     * way to figure out what type of message created the error because
     * we need the information in the wc->imm_data field which does not
     * exist when we have an error.  So, nothing to do here but return. */
    if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", openib_frag_type(des));
        /* Need to think about returning any shared resources of the
         * SRQ.  For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug.  The contents of the
     * output pointer are not reliable if asprintf fails, so reset it to
     * NULL in that case; btlname is later handed to error_cb and
     * free().
     * NOTE(review): openib_btl is dereferenced unconditionally further
     * down, so this NULL check looks purely defensive -- confirm. */
    if (NULL != openib_btl) {
        if (asprintf(&btlname, "lid=%d:name=%s",
                     openib_btl->lid, openib_btl->device->ib_dev->name) < 0) {
            btlname = NULL;
        }
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor.  They are not associated with a PML
     * descriptor because:
     *    A. It was a receive
     *    B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
         (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
         (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                              remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer.  The change in the PML
         * layer should prevent that we ever try to send on this BTL
         * again.  If we do, then this is an error case.  */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIG_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            openib_frag_type(des), (long unsigned int) des);
        free(btlname);
        return;
    }

    /* These are RDMA read type fragments.  Just continue with processing */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write.  Call the PML callback function to map out this
     * btl for further sending.  We just call this every time we get an
     * error even though it is not necessary.  Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    free(btlname);

    /* Since we believe we have done a send, read or write, then the
     * des_local fields should have valid data. */
    assert(des->des_local != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_FAILED state, then
     * change the status.  Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it.  In
     * addition, send a message to other end of the connection letting
     * it know that this side is now broken.  This is needed in the case
     * of a spurious error which may not cause the remote side to detect
     * the error.  */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    /* Now, call the callback function associated with the fragment.
     * In case the fragments were coalesced we need to pull them apart
     * and call the callback function for each one. */
    if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
        opal_list_item_t *i;
        while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if( btl_ownership ) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }

    /* Snapshot everything still needed from the descriptor.  After the
     * callback and the ownership free below, des may have been
     * released and must not be touched again. */
    frag_type = openib_frag_type(des);
    eager_ftr = (long) to_send_frag(des)->ftr;

    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
    if( btl_ownership ) {
        mca_btl_openib_free(&openib_btl->super, des);
    }

    /* Here we send another control message to notify the remote side
     * we had an error on a eager fragment.  A non-zero value for the
     * ftr variable indicates that this was an eager RDMA fragment.
     * We need to do this in case the eager RDMA fragment after this
     * one actually made it successfully. */
    if (0 != eager_ftr) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       eager_ftr - 1);
    }

    /* We know we have completed a send so return some resources even
     * though connection is broken.  With SRQ, the resources are shared
     * so if we do not return the credits we may not be allowed to send
     * anymore. */
    qp_put_wqe(endpoint, qp);
    if((frag_type == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them.  Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path.  This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments so
     * subsequent calls are no-ops. */
    if (endpoint) {
        error_out_all_pending_frags(endpoint, &openib_btl->super, true);
    }
}
Example #7
0
/**
 * This function will find all the pending fragments on an endpoint
 * and call the callback function with OPAL_ERROR.  It walks through
 * each qp with each priority and looks for both no_credits_pending_frags
 * and no_wqe_pending_frags (and, for SRQ qps, the module-level
 * pending_frags).  It then looks for any pending_lazy_frags,
 * pending_put_frags, and pending_get_frags.  This function is only
 * called when running with failover support enabled.  Note that
 * the errout parameter allows the function to also be used as a
 * debugging tool to see if there are any fragments on any of the
 * queues.
 *
 * For the per-qp lists, fragments coalesced into a send fragment are
 * completed individually before the carrying fragment, and any
 * descriptor flagged with MCA_BTL_DES_FLAGS_BTL_OWNERSHIP is released
 * with mca_btl_openib_free() after its callback.  The ownership flag
 * is sampled before the callback is invoked.
 *
 * @param ep Pointer to endpoint that had error
 * @param module Pointer to module that had error
 * @param errout Boolean which says whether to error them out or not
 */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout)
{
    int qp, pri, len, total, btl_ownership;

    opal_list_item_t *item;
    mca_btl_openib_com_frag_t* frag;
    mca_btl_base_descriptor_t *des;
    int verbose = 10;  /* Verbosity level unless debugging */

    /* If debugging, drop verbosity level so we can see the output
     * regardless of the level the program was run with. */
    if (false == errout) {
        verbose = 0;
    }

    total = 0;
    /* Traverse all QPs and all priorities and error out (or just
     * count, when errout is false) whatever is still queued */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* All types of qp's have a no_wqe_pending_frags list */
            len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
            if (len > 0) {
                total += len;
                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                    "IB: Checking for no_wqe_pending_frags qp=%d, "
                                    "pri=%d, list size=%d",
                                    qp, pri, len);
                if (true == errout) {
                    while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                  no_wqe_pending_frags[pri]))) {
                        frag = (mca_btl_openib_com_frag_t *) item;
                        des = (mca_btl_base_descriptor_t *)frag;

                        /* Error out any coalesced frags if they exist */
                        if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                            opal_list_item_t *i;
                            while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                    "IB: Found coalesced frag in no_wqe_pending_frags");
                                btl_ownership = (to_base_frag(i)->base.des_flags &
                                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                 &to_base_frag(i)->base, OPAL_ERROR);
                                if( btl_ownership ) {
                                    mca_btl_openib_free(module, &to_base_frag(i)->base);
                                }
                            }
                        }
                        btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                        des->des_cbfunc(module, ep, des, OPAL_ERROR);
                        if( btl_ownership ) {
                            mca_btl_openib_free(module, des);
                        }
                    }
                }
            }
            if (BTL_OPENIB_QP_TYPE_PP(qp)) {
                /* Per-peer qp: fragments waiting for send credits are
                 * queued on the endpoint */
                len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for no_credits_pending_frags qp=%d, "
                                        "pri=%d, list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                      no_credits_pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in "
                                                        "no_credits_pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }

            } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
                /* SRQ qp: fragments waiting for send credits are
                 * queued on the BTL module the endpoint belongs to */
                len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for srq pending_frags qp=%d, pri=%d, "
                                        "list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
                                                                      u.srq_qp.pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;

                            /* Error out any coalesced frags if they exist */
                            if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in SRQ pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if( btl_ownership ) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if( btl_ownership ) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            }
        }
    }

    /* Check for any frags from a connection that was never made.  Not sure if this
     * can actually happen. */
    len = opal_list_get_size(&ep->pending_lazy_frags);

    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_lazy_frags, list size=%d", len);
        if (true == errout) {
            while  (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    /* Queued RDMA write (put) fragments */
    len = opal_list_get_size(&ep->pending_put_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_put_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    /* Queued RDMA read (get) fragments.  This must drain
     * pending_get_frags itself, the same list whose size was just
     * measured. */
    len = opal_list_get_size(&ep->pending_get_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_get_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_get_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
                        "IB: Finished checking for pending_frags, total moved=%d",
                        total);
}
/*
 * Post one send fragment on an endpoint.
 *
 * this function is called with endpoint->endpoint_lock held
 *
 * Steps: pick the qp (the descriptor's order, defaulting to the
 * fragment's qp_idx), acquire a WQE, acquire either an eager RDMA
 * credit (small priority fragments only) or a regular send credit,
 * piggyback credit information in the header, then hand the fragment
 * to post_send().
 *
 * Returns OMPI_SUCCESS when post_send() returns 0,
 * OMPI_ERR_RESOURCE_BUSY when no WQE or no send credit could be
 * acquired, and OMPI_ERROR when post_send() fails (after giving back
 * the WQE and the credits taken here).
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    size_t eager_limit;

    /* No ordering requested by the caller: use the fragment's own qp */
    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acruire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_RESOURCE_BUSY;

    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);
    if(des->des_src->seg_len + frag->coalesced_length <= eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    /* Not going over eager RDMA: a regular send credit is required.
     * If none is available, give back the WQE taken above. */
    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* Piggyback eager RDMA credits in the header, tagged with the
     * RDMA flag when there are any */
    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    if(!do_rdma) {
        /* No RDMA credits to report on a per-peer qp: report the qp's
         * rd_credits instead */
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }
    } else {
        /* Eager RDMA send: the qp index is encoded into the credits
         * field starting at bit 11 */
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bits, but cm_return is 32 bits: report at most
     * 255 now and put the remainder back for a later send */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if(!ib_rc)
        return OMPI_SUCCESS;

    /* post_send() failed: undo the bookkeeping done above.
     * NOTE(review): the header is converted back with NTOH when
     * endpoint->nbo is set, presumably because post_send() converted
     * it to network byte order -- confirm. */
    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    /* Give back the eager RDMA credits that were piggybacked */
    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_OPENIB_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    /* Return the eager RDMA token or the send credit taken above */
    if(do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                    hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else if BTL_OPENIB_QP_TYPE_SRQ(qp){
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}
/*
 * Create the local side of one qp.  The remote side will be connected
 * later.
 *
 * endpoint     endpoint that owns the qp; the new verbs QP is stored in
 *              endpoint->qps[qp].qp->lcl_qp
 * qp           index of the qp to create
 * srq          shared receive queue to attach to the QP (may be NULL)
 * max_recv_wr  receive queue depth; only applied to per-peer (PP) qps,
 *              any other qp type is created with a receive depth of 0
 * max_send_wr  send queue depth
 *
 * On success the QP has been moved to the INIT state, a random 24-bit
 * local PSN has been chosen and the credit fragment pointer is cleared.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if the QP could not be created or
 * could not be moved to INIT.  In the latter case the QP is not destroyed
 * here; it stays referenced by endpoint->qps[qp].qp->lcl_qp.
 */
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, 
        struct ibv_srq *srq, uint32_t max_recv_wr, uint32_t max_send_wr)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    struct ibv_qp *my_qp;
    struct ibv_qp_init_attr init_attr;
    struct ibv_qp_attr attr;
    size_t req_inline;
    int rc;

    memset(&init_attr, 0, sizeof(init_attr));
    memset(&attr, 0, sizeof(attr));

    init_attr.qp_type = IBV_QPT_RC;
    /* send completions always go to the LP CQ; receive completions go to
     * the CQ selected by qp_cq_prio() for this qp */
    init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
    init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
    init_attr.srq     = srq;
    /* remember what we asked for so truncation can be detected below */
    init_attr.cap.max_inline_data = req_inline = 
        max_inline_size(qp, openib_btl->device);
    init_attr.cap.max_send_sge = 1;
    init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    if(BTL_OPENIB_QP_TYPE_PP(qp)) {
        init_attr.cap.max_recv_wr  = max_recv_wr;
    } else {
        init_attr.cap.max_recv_wr  = 0;
    }
    init_attr.cap.max_send_wr  = max_send_wr;

    my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr); 
    
    if (NULL == my_qp) { 
        BTL_ERROR(("error creating qp errno says %s", strerror(errno))); 
        return OMPI_ERROR; 
    }
    endpoint->qps[qp].qp->lcl_qp = my_qp;

    /* init_attr.cap is re-read after creation: if the inline size that
     * came back is smaller than requested, use the smaller value and warn
     * the user */
    if (init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[qp].ib_inline_max = init_attr.cap.max_inline_data;
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true, orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num,
                       req_inline, init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[qp].ib_inline_max = req_inline;
    }
    
    attr.qp_state        = IBV_QPS_INIT;
    attr.pkey_index      = openib_btl->pkey_index;
    attr.port_num        = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;

    /* ibv_modify_qp() reports the failure reason through its return value
     * (0 on success, otherwise an errno value), so report that code
     * instead of whatever happens to be in the global errno */
    rc = ibv_modify_qp(my_qp,
                       &attr, 
                       IBV_QP_STATE | 
                       IBV_QP_PKEY_INDEX | 
                       IBV_QP_PORT | 
                       IBV_QP_ACCESS_FLAGS );
    if (0 != rc) { 
        BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(rc))); 
        return OMPI_ERROR; 
    } 

    /* Setup meta data on the endpoint */
    endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
    endpoint->qps[qp].credit_frag = NULL;

    return OMPI_SUCCESS;
}
/*
 * Connect the local ends of all qp's to the remote side
 *
 * For every qp of the endpoint the local verbs QP is moved first to RTR
 * (ready to receive) using the remote QP number, PSN and LID stored in
 * endpoint->rem_info, and then to RTS (ready to send) using the local PSN
 * stored on the endpoint.  The path MTU used is the smaller of the local
 * device MTU and the remote MTU.
 *
 * Returns OMPI_SUCCESS if every qp reached RTS, or OMPI_ERROR as soon as
 * one state transition fails (remaining qps are not processed).
 */
static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
{
    int i, rc;
    mca_btl_openib_module_t* openib_btl =
        (mca_btl_openib_module_t*)endpoint->endpoint_btl;
    /* the path MTU does not depend on the qp, compute it once */
    enum ibv_mtu mtu = (openib_btl->device->mtu < endpoint->rem_info.rem_mtu) ?
        openib_btl->device->mtu : endpoint->rem_info.rem_mtu;

    for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
        struct ibv_qp_attr attr;
        struct ibv_qp* qp = endpoint->qps[i].qp->lcl_qp;

        /* --- INIT -> RTR --- */
        memset(&attr, 0, sizeof(attr));
        attr.qp_state           = IBV_QPS_RTR;
        attr.path_mtu           = mtu;
        attr.dest_qp_num        = endpoint->rem_info.rem_qps[i].rem_qp_num;
        attr.rq_psn             = endpoint->rem_info.rem_qps[i].rem_psn;
        attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
        attr.min_rnr_timer  = mca_btl_openib_component.ib_min_rnr_timer;
        attr.ah_attr.is_global     = 0;
        attr.ah_attr.dlid          = endpoint->rem_info.rem_lid;
        attr.ah_attr.sl            = mca_btl_openib_component.ib_service_level;
        attr.ah_attr.src_path_bits = openib_btl->src_path_bits;
        attr.ah_attr.port_num      = openib_btl->port_num;
        /* JMS to be filled in later dynamically */
        attr.ah_attr.static_rate   = 0;

        if (mca_btl_openib_component.verbose) {
            BTL_OUTPUT(("Set MTU to IBV value %d (%s bytes)", mtu,
                        (mtu == IBV_MTU_256) ? "256" :
                        (mtu == IBV_MTU_512) ? "512" :
                        (mtu == IBV_MTU_1024) ? "1024" :
                        (mtu == IBV_MTU_2048) ? "2048" :
                        (mtu == IBV_MTU_4096) ? "4096" :
                        "unknown (!)"));
        }

        /* ibv_modify_qp() returns 0 on success or an errno value that
         * describes the failure; report that value rather than the global
         * errno */
        rc = ibv_modify_qp(qp, &attr,
                           IBV_QP_STATE              |
                           IBV_QP_AV                 |
                           IBV_QP_PATH_MTU           |
                           IBV_QP_DEST_QPN           |
                           IBV_QP_RQ_PSN             |
                           IBV_QP_MAX_DEST_RD_ATOMIC |
                           IBV_QP_MIN_RNR_TIMER);
        if (0 != rc) {
            BTL_ERROR(("error modifing QP to RTR errno says %s",
                       strerror(rc)));
            return OMPI_ERROR; 
        }

        /* --- RTR -> RTS --- the same attr struct is reused; only the
         * fields selected by the mask below are set here */
        attr.qp_state       = IBV_QPS_RTS;
        attr.timeout        = mca_btl_openib_component.ib_timeout;
        attr.retry_cnt      = mca_btl_openib_component.ib_retry_count;
        /* On PP QPs we have SW flow control, no need for rnr retries. Setting
         * it to zero helps to catch bugs */
        attr.rnr_retry      = BTL_OPENIB_QP_TYPE_PP(i) ? 0 :
            mca_btl_openib_component.ib_rnr_retry;
        attr.sq_psn         = endpoint->qps[i].qp->lcl_psn;
        attr.max_rd_atomic  = mca_btl_openib_component.ib_max_rdma_dst_ops;

        rc = ibv_modify_qp(qp, &attr,
                           IBV_QP_STATE              |
                           IBV_QP_TIMEOUT            |
                           IBV_QP_RETRY_CNT          |
                           IBV_QP_RNR_RETRY          |
                           IBV_QP_SQ_PSN             |
                           IBV_QP_MAX_QP_RD_ATOMIC);
        if (0 != rc) {
            BTL_ERROR(("error modifying QP to RTS errno says %s",
                       strerror(rc)));
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}