Ejemplo n.º 1
0
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    int qp = to_base_frag(frag)->base.order;
    struct ibv_send_wr *bad_wr;

    /* check for a send wqe */
    if (qp_get_wqe(ep, qp) < 0) {
        qp_put_wqe(ep, qp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* check for a get token */
    if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
        qp_put_wqe(ep, qp);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    qp_reset_signal_count(ep, qp);

    if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
        qp_put_wqe(ep, qp);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
Ejemplo n.º 2
0
static void recv_constructor(mca_btl_wv_recv_frag_t *frag)
{
    mca_btl_wv_frag_t *base_frag = to_base_frag(frag);

    base_frag->type = MCA_BTL_WV_FRAG_RECV;

    frag->hdr = (mca_btl_wv_header_t*)base_frag->base.super.ptr;
    base_frag->segment.seg_addr.pval =
        ((unsigned char* )frag->hdr) + sizeof(mca_btl_wv_header_t);
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;

    frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->rd_desc.sg_list = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->rd_desc.num_sge = 1;
    frag->rd_desc.next = NULL;
}
Ejemplo n.º 3
0
static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_CONTROL;
    /* adjusting headers because there is no coalesce header in control messages */
    frag->hdr = frag->chdr;
    to_base_frag(frag)->segment.seg_addr.pval = frag->hdr + 1;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;
}
Ejemplo n.º 4
0
/* send the eager rdma connect message to the remote endpoint */
static int mca_btl_openib_endpoint_send_eager_rdma(
    mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    int rc;

    frag = alloc_control_frag(openib_btl);
    if(NULL == frag) {
        return -1;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)to_base_frag(frag)->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);
    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 rdma_hdr->rkey,
                 rdma_hdr->rdma_start.lval,
                 rdma_hdr->rdma_start.pval,
                 rdma_hdr->rdma_start.ival,
                 rdma_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t)
                 ));

    if(endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));

        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval,
                     rdma_hdr->rdma_start.ival
                     ));
    }
    rc = mca_btl_openib_endpoint_send(endpoint, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
        return OPAL_SUCCESS;

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
    return rc;
}
Ejemplo n.º 5
0
static void get_constructor(mca_btl_wv_get_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_RECV_USER;

    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.Opcode = WvRdmaRead;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
    frag->sr_desc.pNext = NULL;
}
Ejemplo n.º 6
0
static void get_constructor(mca_btl_openib_get_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER;

    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.next = NULL;
}
Ejemplo n.º 7
0
static void send_constructor(mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_frag_t *base_frag = to_base_frag(frag);

    base_frag->type = MCA_BTL_WV_FRAG_SEND;

    frag->chdr = (mca_btl_wv_header_t*)base_frag->base.super.ptr;
    frag->hdr = (mca_btl_wv_header_t*)
        (((unsigned char*)base_frag->base.super.ptr) +
        sizeof(mca_btl_wv_header_coalesced_t) +
        sizeof(mca_btl_wv_control_header_t));
    base_frag->segment.seg_addr.pval = frag->hdr + 1;
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;
    frag->coalesced_length = 0;
    OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t);
}
Ejemplo n.º 8
0
static void out_constructor(mca_btl_wv_out_frag_t *frag)
{
    mca_btl_wv_frag_t *base_frag = to_base_frag(frag);

    base_frag->base.des_src = &base_frag->segment;
    base_frag->base.des_src_cnt = 1;
    base_frag->base.des_dst = NULL;
    base_frag->base.des_dst_cnt = 0;

    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry; 
    frag->sr_desc.nSge = 1;
    frag->sr_desc.Opcode = WvSend;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
    frag->sr_desc.pNext = NULL;
}
Ejemplo n.º 9
0
static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
    mca_btl_openib_frag_t *base_frag = to_base_frag(frag);

    base_frag->base.des_src = &base_frag->segment;
    base_frag->base.des_src_cnt = 1;
    base_frag->base.des_dst = NULL;
    base_frag->base.des_dst_cnt = 0;

    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.opcode = IBV_WR_SEND;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.next = NULL;
}
Ejemplo n.º 10
0
void mca_btl_wv_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_btl_wv_frag_init_data_t* init_data = (mca_btl_wv_frag_init_data_t *) ctx;
    mca_btl_wv_frag_t *frag = to_base_frag(item);

    if(MCA_BTL_WV_FRAG_RECV == frag->type) {
        to_recv_frag(frag)->qp_idx = init_data->order;
        to_com_frag(frag)->sg_entry.Length =
            mca_btl_wv_component.qp_infos[init_data->order].size +
            sizeof(mca_btl_wv_header_t) +
            sizeof(mca_btl_wv_header_coalesced_t) +
            sizeof(mca_btl_wv_control_header_t);
    }

    if(MCA_BTL_WV_FRAG_SEND == frag->type)
        to_send_frag(frag)->qp_idx = init_data->order;

    frag->list = init_data->list;
}
Ejemplo n.º 11
0
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.rdma.remote_addr = remote_address;
    /* the opcode may have been changed by an atomic operation */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
            != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            return OPAL_SUCCESS;
        }

        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            rc = OPAL_SUCCESS;

            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
Ejemplo n.º 12
0
/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * brings its endpoint down as well.  This is needed because there are
 * cases where only one side of the connection determines that the
 * there was a problem.
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the
     * endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if(newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
    return;
}
Ejemplo n.º 13
0
/* Setup eager RDMA buffers and notify the remote endpoint*/
void mca_btl_openib_endpoint_connect_eager_rdma(
        mca_btl_openib_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_openib_recv_frag_t *headers_buf;
    int i;
    uint32_t flag = MCA_MPOOL_FLAGS_CACHE_BYPASS;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    headers_buf = (mca_btl_openib_recv_frag_t*)
        malloc(sizeof(mca_btl_openib_recv_frag_t) *
            mca_btl_openib_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

#if HAVE_DECL_IBV_ACCESS_SO
    /* Solaris implements the Relaxed Ordering feature defined in the
       PCI Specification. With this in mind any memory region which
       relies on a buffer being written in a specific order, for
       example the eager rdma connections created in this routinue,
       must set a strong order flag when registering the memory for
       rdma operations.

       The following flag will be interpreted and the appropriate
       steps will be taken when the memory is registered in
       openib_reg_mr(). */
    flag |= MCA_MPOOL_FLAGS_SO_MEM;
#endif

    buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
            openib_btl->eager_rdma_frag_size *
            mca_btl_openib_component.eager_rdma_num,
            mca_btl_openib_component.buffer_alignment,
            flag,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
       goto free_headers_buf;

    buf = buf + openib_btl->eager_rdma_frag_size -
        sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
        sizeof(mca_btl_openib_header_t);

    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        opal_free_list_item_t *item;
        mca_btl_openib_recv_frag_t * frag;
        mca_btl_openib_frag_init_data_t init_data;

        item = (opal_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);

        init_data.order = mca_btl_openib_component.credits_qp;
        init_data.list = NULL;

        mca_btl_openib_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        frag->ftr = (mca_btl_openib_footer_t*)
            ((char*)to_base_frag(frag)->segment.seg_addr.pval +
             mca_btl_openib_component.eager_limit);

        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    endpoint->eager_rdma_local.rd_win =
        mca_btl_openib_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 (void*)1, buf);

    if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
        mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_openib_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&openib_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
           buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
Ejemplo n.º 14
0
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
        const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_IB;
        to_base_frag(frag)->segment.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;
    if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
     }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
         BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
            frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s", rc,
                strerror(errno)));
}
Ejemplo n.º 15
0
/* Setup eager RDMA buffers and notify the remote endpoint*/
void mca_btl_wv_endpoint_connect_eager_rdma(
        mca_btl_wv_endpoint_t* endpoint)
{
    mca_btl_wv_module_t* wv_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_wv_recv_frag_t *headers_buf;
    int i;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    headers_buf = (mca_btl_wv_recv_frag_t*)
        malloc(sizeof(mca_btl_wv_recv_frag_t) *
            mca_btl_wv_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

    buf = (char *) wv_btl->super.btl_mpool->mpool_alloc(wv_btl->super.btl_mpool,
            wv_btl->eager_rdma_frag_size *
            mca_btl_wv_component.eager_rdma_num,
            mca_btl_wv_component.buffer_alignment,
            MCA_MPOOL_FLAGS_CACHE_BYPASS,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
       goto free_headers_buf;

    buf = buf + wv_btl->eager_rdma_frag_size -
        sizeof(mca_btl_wv_footer_t) - wv_btl->super.btl_eager_limit -
        sizeof(mca_btl_wv_header_t);

    for(i = 0; i < mca_btl_wv_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item;
        mca_btl_wv_recv_frag_t * frag;
        mca_btl_wv_frag_init_data_t init_data;

        item = (ompi_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * wv_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_wv_recv_frag_t);

        init_data.order = mca_btl_wv_component.credits_qp;
        init_data.list = NULL;

        mca_btl_wv_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_WV_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        frag->ftr = (mca_btl_wv_footer_t*)
            ((char*)to_base_frag(frag)->segment.base.seg_addr.pval +
             mca_btl_wv_component.eager_limit);

        MCA_BTL_WV_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    endpoint->eager_rdma_local.rd_win =
        mca_btl_wv_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
            buf);

    if(mca_btl_wv_endpoint_send_eager_rdma(endpoint) == OMPI_SUCCESS) {
        mca_btl_wv_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_wv_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&wv_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    wv_btl->super.btl_mpool->mpool_free(wv_btl->super.btl_mpool,
           buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
            endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}