/*
 * Post the send work request held in frag->sr_desc on the queue pair whose
 * index is stored in the fragment's base descriptor (base.order).
 *
 * A send WQE and a get token are both required before posting.  If either
 * is unavailable the counters are restored and OPAL_ERR_OUT_OF_RESOURCE is
 * returned.  If ibv_post_send() itself fails, both counters are restored
 * and OPAL_ERROR is returned.  The btl argument is not used here.
 */
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    struct ibv_send_wr *bad_wr;
    const int qp_index = to_base_frag(frag)->base.order;
    int status;

    /* check for a send wqe */
    if (0 > qp_get_wqe(ep, qp_index)) {
        qp_put_wqe(ep, qp_index);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* check for a get token */
    if (0 > OPAL_THREAD_ADD32(&ep->get_tokens, -1)) {
        status = OPAL_ERR_OUT_OF_RESOURCE;
        goto give_back;
    }

    qp_inflight_wqe_to_frag(ep, qp_index, to_com_frag(frag));
    qp_reset_signal_count(ep, qp_index);

    if (0 != ibv_post_send(ep->qps[qp_index].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
        status = OPAL_ERROR;
        goto give_back;
    }

    return OPAL_SUCCESS;

give_back:
    /* undo the WQE reservation first, then the token, as on every failure */
    qp_put_wqe(ep, qp_index);
    OPAL_THREAD_ADD32(&ep->get_tokens, 1);
    return status;
}
/*
 * Constructor for a wv receive fragment.
 *
 * The header sits at the start of the buffer referenced by the free-list
 * item (base.super.ptr); the payload segment begins immediately after it.
 * The receive work request is wired to the fragment's single SGE, which
 * points at the header.
 */
static void recv_constructor(mca_btl_wv_recv_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);
    unsigned char *payload;

    base->type = MCA_BTL_WV_FRAG_RECV;

    frag->hdr = (mca_btl_wv_header_t*)base->base.super.ptr;
    payload = ((unsigned char*)frag->hdr) + sizeof(mca_btl_wv_header_t);
    base->segment.seg_addr.pval = payload;

    /* the scatter/gather entry covers the buffer starting at the header */
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;

    /* receive descriptor: one SGE, not chained, identified by the frag itself */
    frag->rd_desc.next = NULL;
    frag->rd_desc.num_sge = 1;
    frag->rd_desc.sg_list = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag;
}
/*
 * Constructor for an openib control fragment.
 *
 * Control messages carry no coalesce header, so the message header is
 * moved to the location of chdr and the payload segment starts right
 * after that header.
 */
static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_OPENIB_FRAG_CONTROL;

    /* adjusting headers because there is no coalesce header in control
     * messages */
    frag->hdr = frag->chdr;
    base->segment.seg_addr.pval = &frag->hdr[1];
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;
}
/*
 * Send the eager RDMA connect message to the remote endpoint.
 *
 * A control fragment is filled with an eager-RDMA header carrying the rkey
 * of the local eager RDMA registration and the start address stored in
 * endpoint->eager_rdma_local.base, converted to network byte order when
 * endpoint->nbo is set, and handed to mca_btl_openib_endpoint_send().
 *
 * Returns OPAL_SUCCESS when the send succeeded or reported
 * OPAL_ERR_RESOURCE_BUSY, -1 when no control fragment could be allocated,
 * and otherwise the error code of the send (the fragment is returned first).
 */
static int mca_btl_openib_endpoint_send_eager_rdma(
        mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_frag_t *base;
    int rc;

    frag = alloc_control_frag(openib_btl);
    if (NULL == frag) {
        return -1;
    }
    base = to_base_frag(frag);

    /* descriptor setup: high priority, completion callback always invoked */
    base->base.des_cbfunc = mca_btl_openib_endpoint_eager_rdma_connect_cb;
    base->base.des_cbdata = NULL;
    base->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    base->base.order = mca_btl_openib_component.credits_qp;
    base->segment.seg_len = sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;

    frag->hdr->tag = MCA_BTL_TAG_IB;

    /* payload: the eager RDMA header */
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)base->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval = opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);

    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 rdma_hdr->rkey,
                 rdma_hdr->rdma_start.lval,
                 rdma_hdr->rdma_start.pval,
                 rdma_hdr->rdma_start.ival,
                 rdma_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t)
                 ));

    if (endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));

        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval,
                     rdma_hdr->rdma_start.ival
                     ));
    }

    rc = mca_btl_openib_endpoint_send(endpoint, frag);
    if (OPAL_SUCCESS != rc && OPAL_ERR_RESOURCE_BUSY != rc) {
        /* hard failure: give the fragment back and report */
        MCA_BTL_IB_FRAG_RETURN(frag);
        BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
        return rc;
    }

    return OPAL_SUCCESS;
}
/*
 * Constructor for a wv get fragment: marks the fragment as a user receive
 * fragment and presets its send request as a signaled RDMA read with a
 * single SGE, identified by the fragment's own address.
 */
static void get_constructor(mca_btl_wv_get_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_WV_FRAG_RECV_USER;

    /* what to do and how to signal it */
    frag->sr_desc.Opcode = WvRdmaRead;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;

    /* where the data goes: the fragment's own scatter/gather entry */
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.nSge = 1;

    /* bookkeeping: request id is the fragment, no chained request */
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.pNext = NULL;
}
/*
 * Constructor for an openib get fragment: marks the fragment as a user
 * receive fragment and presets its send work request as a signaled RDMA
 * read with a single SGE, identified by the fragment's own address.
 */
static void get_constructor(mca_btl_openib_get_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_OPENIB_FRAG_RECV_USER;

    /* what to do and how to signal it */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;

    /* where the data goes: the fragment's own scatter/gather entry */
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;

    /* bookkeeping: work request id is the fragment, no chained request */
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.next = NULL;
}
/*
 * Constructor for a wv send fragment.
 *
 * chdr points at the very start of the fragment buffer; hdr is placed
 * after room for a coalesced header and a control header, and the payload
 * segment starts right after hdr.  The coalescing state starts out empty.
 */
static void send_constructor(mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);
    unsigned char *buffer = (unsigned char*)base->base.super.ptr;

    base->type = MCA_BTL_WV_FRAG_SEND;

    frag->chdr = (mca_btl_wv_header_t*)base->base.super.ptr;
    frag->hdr = (mca_btl_wv_header_t*)
        (buffer + sizeof(mca_btl_wv_header_coalesced_t)
                + sizeof(mca_btl_wv_control_header_t));

    base->segment.seg_addr.pval = &frag->hdr[1];
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;

    frag->coalesced_length = 0;
    OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t);
}
/*
 * Constructor for a wv outgoing fragment: the descriptor exposes the
 * fragment's own segment as its single source segment and has no
 * destination segments.  The send request is preset as a signaled send
 * with one SGE, identified by the fragment's own address.
 */
static void out_constructor(mca_btl_wv_out_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    /* descriptor segments: one source, no destination */
    base->base.des_dst = NULL;
    base->base.des_dst_cnt = 0;
    base->base.des_src = &base->segment;
    base->base.des_src_cnt = 1;

    /* send request template */
    frag->sr_desc.Opcode = WvSend;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.pNext = NULL;
}
/*
 * Constructor for an openib outgoing fragment: the descriptor exposes the
 * fragment's own segment as its single source segment and has no
 * destination segments.  The send work request is preset as a signaled
 * send with one SGE, identified by the fragment's own address.
 */
static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    /* descriptor segments: one source, no destination */
    base->base.des_dst = NULL;
    base->base.des_dst_cnt = 0;
    base->base.des_src = &base->segment;
    base->base.des_src_cnt = 1;

    /* send work request template */
    frag->sr_desc.opcode = IBV_WR_SEND;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.next = NULL;
}
/*
 * Per-item initialisation applied to a wv fragment.
 *
 * ctx is a mca_btl_wv_frag_init_data_t carrying the queue pair index
 * (order) and the list the fragment belongs to.  Receive fragments get
 * their qp index and an SGE length sized for the qp's buffer plus the
 * regular, coalesced and control headers; send fragments only get their
 * qp index.  Every fragment records the owning list.
 */
void mca_btl_wv_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_btl_wv_frag_init_data_t* init_data = (mca_btl_wv_frag_init_data_t *) ctx;
    mca_btl_wv_frag_t *frag = to_base_frag(item);

    /* NOTE(review): assumes the RECV and SEND type constants are distinct,
     * so at most one of the two branches can apply */
    if (MCA_BTL_WV_FRAG_RECV == frag->type) {
        to_recv_frag(frag)->qp_idx = init_data->order;
        to_com_frag(frag)->sg_entry.Length =
            mca_btl_wv_component.qp_infos[init_data->order].size +
            sizeof(mca_btl_wv_header_t) +
            sizeof(mca_btl_wv_header_coalesced_t) +
            sizeof(mca_btl_wv_control_header_t);
    } else if (MCA_BTL_WV_FRAG_SEND == frag->type) {
        to_send_frag(frag)->qp_idx = init_data->order;
    }

    frag->list = init_data->list;
}
/*
 * Start a get of `size` bytes from `remote_address` on the peer into
 * `local_address`.
 *
 * A get fragment is allocated and filled with the local SGE, the
 * completion callback and the remote address/rkey (byte-swapped when
 * heterogeneous support is enabled and the two processes differ in
 * endianness).  If `order` is MCA_BTL_NO_ORDER the component's rdma_qp is
 * used.
 *
 * Returns OPAL_ERR_BAD_PARAM when size exceeds btl->btl_get_limit and
 * OPAL_ERR_OUT_OF_RESOURCE when no fragment is available.  OPAL_SUCCESS is
 * returned when the request was posted, and also when it was queued:
 * either check_endpoint_state() reported OPAL_ERR_RESOURCE_BUSY on a
 * not-yet-connected endpoint, or posting ran out of resources and the
 * fragment was appended to ep->pending_get_frags.  Any other failure
 * returns the fragment and propagates the error code.
 * The `flags` argument is not used here.
 */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t *frag;
    mca_btl_openib_frag_t *base_frag;
    int qp;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    base_frag = to_base_frag(frag);

    /* fall back to the component's RDMA queue pair when no order was requested */
    qp = (MCA_BTL_NO_ORDER == order) ? mca_btl_openib_component.rdma_qp : order;

    /* set base descriptor flags */
    base_frag->base.order = qp;
    /* free this descriptor when the operation is complete */
    base_frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.rdma.remote_addr = remote_address;
    /* the opcode may have been changed by an atomic operation */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    if ((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
        != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        /* peer has the opposite byte order */
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (MCA_BTL_IB_CONNECTED != ep->endpoint_state) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &base_frag->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            return OPAL_SUCCESS;
        }

        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
        return OPAL_SUCCESS;
    }

    if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
        /* no resources right now: queue the fragment and report success */
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        return OPAL_SUCCESS;
    }

    MCA_BTL_IB_FRAG_RETURN (frag);
    return rc;
}
/**
 * Tell the remote side that an endpoint is broken so that it can bring
 * its own endpoint down as well.  This is needed because there are cases
 * where only one side of the connection notices that there was a problem.
 *
 * The message is sent over a different BTL than the one that saw the
 * error, using that BTL's endpoint to the same remote process.  If no
 * other BTL, no matching endpoint or no control fragment is available,
 * the function returns without sending anything.
 *
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    mca_btl_base_endpoint_t* newep = NULL;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_frag_t *base_frag;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;
    int i, rc;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (openib_btl == mca_btl_openib_component.openib_btls[i]) {
            continue;
        }
        newbtl = mca_btl_openib_component.openib_btls[i];
        break;
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the
     * endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        mca_btl_base_endpoint_t *candidate = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);

        if (NULL != candidate && candidate->endpoint_proc->proc_opal == remote_proc) {
            newep = candidate;
            break;
        }
    }
    if (NULL == newep) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if (NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }
    base_frag = to_base_frag(frag);

    /* descriptor setup: high priority, completion callback always invoked */
    base_frag->base.des_cbfunc = mca_btl_openib_endpoint_notify_cb;
    base_frag->base.des_cbdata = NULL;
    base_frag->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    base_frag->base.order = mca_btl_openib_component.credits_qp;
    base_frag->segment.base.seg_len = sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;

    /* payload: identify the broken port (lid, subnet) and this process */
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)base_frag->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if (newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }

    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS != rc && OPAL_ERR_RESOURCE_BUSY != rc) {
        /* hard failure: give the fragment back and report */
        MCA_BTL_IB_FRAG_RETURN(frag);
        BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)", strerror(errno)));
    }
    return;
}
/*
 * Setup eager RDMA buffers and notify the remote endpoint.
 *
 * Only one caller at a time gets past the entry check: the local RDMA base
 * pointer is swapped from NULL to the placeholder value 1 for the duration
 * of the setup.  The function then allocates an array of receive fragment
 * headers with malloc() and a registered buffer from the BTL's mpool,
 * constructs one eager-RDMA receive fragment per slot, publishes the real
 * base pointer and sends the eager RDMA connect message.  On success the
 * endpoint is retained and registered in the device's eager_rdma_buffers
 * array.
 *
 * On any failure everything allocated here is released and the base
 * pointer is reset to NULL so that a later call can retry.
 */
void mca_btl_openib_endpoint_connect_eager_rdma(
        mca_btl_openib_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    char *alloc_base;   /* pointer exactly as returned by mpool_alloc() */
    char *buf;          /* alloc_base shifted to where fragment 0 starts */
    mca_btl_openib_recv_frag_t *headers_buf;
    int i;
    uint32_t flag = MCA_MPOOL_FLAGS_CACHE_BYPASS;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    headers_buf = (mca_btl_openib_recv_frag_t*)
        malloc(sizeof(mca_btl_openib_recv_frag_t) *
            mca_btl_openib_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

#if HAVE_DECL_IBV_ACCESS_SO
    /* Solaris implements the Relaxed Ordering feature defined in the
       PCI Specification. With this in mind any memory region which
       relies on a buffer being written in a specific order, for
       example the eager rdma connections created in this routine,
       must set a strong order flag when registering the memory for
       rdma operations.

       The following flag will be interpreted and the appropriate
       steps will be taken when the memory is registered in
       openib_reg_mr(). */
    flag |= MCA_MPOOL_FLAGS_SO_MEM;
#endif

    alloc_base = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
            openib_btl->eager_rdma_frag_size *
            mca_btl_openib_component.eager_rdma_num,
            mca_btl_openib_component.buffer_alignment,
            flag,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!alloc_base)
       goto free_headers_buf;

    /* Shift the working pointer inside the first slot.  The unshifted
     * pointer is kept in alloc_base because that is the address that has
     * to be handed back to mpool_free() on the error path. */
    buf = alloc_base + openib_btl->eager_rdma_frag_size -
        sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
        sizeof(mca_btl_openib_header_t);

    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        opal_free_list_item_t *item;
        mca_btl_openib_recv_frag_t * frag;
        mca_btl_openib_frag_init_data_t init_data;

        /* registration and ptr are filled in before construction; the
         * fragment is built in place inside headers_buf */
        item = (opal_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);

        init_data.order = mca_btl_openib_component.credits_qp;
        init_data.list = NULL;

        mca_btl_openib_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        frag->ftr = (mca_btl_openib_footer_t*)
            ((char*)to_base_frag(frag)->segment.seg_addr.pval +
             mca_btl_openib_component.eager_limit);

        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* receive window: a quarter of the buffers, but at least one */
    endpoint->eager_rdma_local.rd_win =
        mca_btl_openib_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 (void*)1, buf);

    if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
        mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_openib_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* claim the next free slot; retry if another thread took it first */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&openib_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* release the allocation using the address mpool_alloc() returned,
     * not the shifted working pointer */
    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
           alloc_base, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
/*
 * Send accumulated credits for queue pair `qp` to the remote endpoint.
 *
 * The per-qp credit fragment is created on first use and reused
 * afterwards.  The message goes out via eager RDMA when a send credit for
 * it can be acquired; otherwise it uses a reserved credit-message slot on
 * the qp, and the function gives up if those slots (rd_rsv) are exhausted.
 *
 * The credit counters are drained into the message headers; if posting
 * fails they are added back and the token / cm_sent reservation is undone.
 *
 * Every exit path except a successful post releases the credits-send lock
 * with BTL_OPENIB_CREDITS_SEND_UNLOCK.
 * NOTE(review): the lock is presumably taken by the caller and released by
 * the completion callback on success - confirm against the call sites.
 */
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
        const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        if(OPAL_UNLIKELY(NULL == frag)) {
            /* No control fragment available.  Nothing has been consumed
             * yet, so just release the lock like the other early exit
             * below; the credits stay accumulated for a later attempt. */
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_IB;
        to_base_frag(frag)->segment.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;

    if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        /* fall back to a reserved credit-message slot on this qp */
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        /* report at most 255 in this message and put the rest back */
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo) {
        BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);
    }

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    /* Posting failed: convert the headers back to host order so the credit
     * values read below are usable.
     * NOTE(review): frag->hdr is converted back here although only
     * credits_hdr is converted above - presumably post_send() converts
     * frag->hdr; confirm. */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
            frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
            credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s", rc,
                strerror(errno)));
}
/*
 * Setup eager RDMA buffers and notify the remote endpoint.
 *
 * Only one caller at a time gets past the entry check: the local RDMA base
 * pointer is swapped from NULL to the placeholder value 1 for the duration
 * of the setup.  The function then allocates an array of receive fragment
 * headers with malloc() and a registered buffer from the BTL's mpool,
 * constructs one eager-RDMA receive fragment per slot, publishes the real
 * base pointer and sends the eager RDMA connect message.  On success the
 * endpoint is retained and registered in the device's eager_rdma_buffers
 * array.
 *
 * On any failure everything allocated here is released and the base
 * pointer is reset to NULL so that a later call can retry.
 */
void mca_btl_wv_endpoint_connect_eager_rdma(
        mca_btl_wv_endpoint_t* endpoint)
{
    mca_btl_wv_module_t* wv_btl = endpoint->endpoint_btl;
    char *alloc_base;   /* pointer exactly as returned by mpool_alloc() */
    char *buf;          /* alloc_base shifted to where fragment 0 starts */
    mca_btl_wv_recv_frag_t *headers_buf;
    int i;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                (void*)1))
        return;

    headers_buf = (mca_btl_wv_recv_frag_t*)
        malloc(sizeof(mca_btl_wv_recv_frag_t) *
            mca_btl_wv_component.eager_rdma_num);

    if(NULL == headers_buf)
       goto unlock_rdma_local;

    alloc_base = (char *) wv_btl->super.btl_mpool->mpool_alloc(wv_btl->super.btl_mpool,
            wv_btl->eager_rdma_frag_size *
            mca_btl_wv_component.eager_rdma_num,
            mca_btl_wv_component.buffer_alignment,
            MCA_MPOOL_FLAGS_CACHE_BYPASS,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!alloc_base)
       goto free_headers_buf;

    /* Shift the working pointer inside the first slot.  The unshifted
     * pointer is kept in alloc_base because that is the address that has
     * to be handed back to mpool_free() on the error path. */
    buf = alloc_base + wv_btl->eager_rdma_frag_size -
        sizeof(mca_btl_wv_footer_t) - wv_btl->super.btl_eager_limit -
        sizeof(mca_btl_wv_header_t);

    for(i = 0; i < mca_btl_wv_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item;
        mca_btl_wv_recv_frag_t * frag;
        mca_btl_wv_frag_init_data_t init_data;

        /* registration and ptr are filled in before construction; the
         * fragment is built in place inside headers_buf */
        item = (ompi_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * wv_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_wv_recv_frag_t);

        init_data.order = mca_btl_wv_component.credits_qp;
        init_data.list = NULL;

        mca_btl_wv_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_WV_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        frag->ftr = (mca_btl_wv_footer_t*)
            ((char*)to_base_frag(frag)->segment.base.seg_addr.pval +
             mca_btl_wv_component.eager_limit);

        MCA_BTL_WV_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* receive window: a quarter of the buffers, but at least one */
    endpoint->eager_rdma_local.rd_win =
        mca_btl_wv_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1,
            buf);

    if(mca_btl_wv_endpoint_send_eager_rdma(endpoint) == OMPI_SUCCESS) {
        mca_btl_wv_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_wv_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* claim the next free slot; retry if another thread took it first */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&wv_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* release the allocation using the address mpool_alloc() returned,
     * not the shifted working pointer */
    wv_btl->super.btl_mpool->mpool_free(wv_btl->super.btl_mpool,
           alloc_base, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
            endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}