/*
 * Function used for debugging problems in eager rdma.
 *
 * Walks every slot of the endpoint's local eager-RDMA ring and prints it:
 * slots holding a CREDITS control message are reported as such, anything
 * else is dumped with its size, tag and the footer's buf[3] byte.
 * Output goes to opal_output stream 0 (unconditional).
 */
static void dump_local_rdma_frags(mca_btl_openib_endpoint_t * endpoint) {
    mca_btl_openib_recv_frag_t *headers_buf = endpoint->eager_rdma_local.frags;
    mca_btl_openib_recv_frag_t * frag;
    mca_btl_openib_control_header_t* chdr;
    int i, size;

    opal_output(0, "Head = %d", endpoint->eager_rdma_local.head);

    for (i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        frag = &headers_buf[i];
        size = MCA_BTL_OPENIB_RDMA_FRAG_GET_SIZE(frag->ftr);

        /* The footer sits at the end of the message; recompute where the
         * header must be from the footer position and the message size. */
        frag->hdr = (mca_btl_openib_header_t*)(((char*)frag->ftr) -
                size + sizeof(mca_btl_openib_footer_t));
        /* Payload starts immediately after the BTL header. */
        to_base_frag(frag)->segment.base.seg_addr.pval =
                ((unsigned char* )frag->hdr) + sizeof(mca_btl_openib_header_t);

        chdr = to_base_frag(frag)->segment.base.seg_addr.pval;
        if ((MCA_BTL_TAG_IB == frag->hdr->tag) &&
            (MCA_BTL_OPENIB_CONTROL_CREDITS == chdr->type)) {
            opal_output(0, "tag[%d] is credit message", i);
        } else {
            opal_output(0, "frag[%d] size=%d,tag=%d,ftr->u.buf=%d",
                        i, size, frag->hdr->tag, frag->ftr->u.buf[3]);
        }
    }
}
/*
 * Reserve one send credit for the frag's QP before posting.
 *
 * Per-peer QPs keep credits on the endpoint; SRQ QPs share credits at the
 * module level (the SRQ pending list is protected by the module ib_lock).
 * On failure the decrement is undone, the frag is parked on the matching
 * pending list and OMPI_ERR_OUT_OF_RESOURCE is returned.
 */
static int acquire_send_credit(mca_btl_wv_endpoint_t *endpoint,
                               mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_module_t *btl = endpoint->endpoint_btl;
    const int qp = to_base_frag(frag)->base.order;
    const int prio = !(to_base_frag(frag)->base.des_flags &
                       MCA_BTL_DES_FLAGS_PRIORITY);

    if (BTL_WV_QP_TYPE_PP(qp)) {
        /* per-peer QP: endpoint-local credit pool, no lock needed */
        if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) >= 0) {
            return OMPI_SUCCESS;
        }
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
                         (opal_list_item_t *)frag);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* SRQ: credits shared across endpoints; pending list needs ib_lock */
    if (OPAL_THREAD_ADD32(&btl->qps[qp].u.srq_qp.sd_credits, -1) >= 0) {
        return OMPI_SUCCESS;
    }
    OPAL_THREAD_ADD32(&btl->qps[qp].u.srq_qp.sd_credits, 1);
    OPAL_THREAD_LOCK(&btl->ib_lock);
    opal_list_append(&btl->qps[qp].u.srq_qp.pending_frags[prio],
                     (opal_list_item_t *)frag);
    OPAL_THREAD_UNLOCK(&btl->ib_lock);
    return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
 * Constructor for control-message send frags.
 *
 * Control messages carry no coalesce header, so the wire header starts at
 * the very beginning of the buffer (chdr) and the payload follows it.
 */
static void send_control_constructor(mca_btl_openib_send_control_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_OPENIB_FRAG_CONTROL;
    /* adjust headers: no coalesce header in control messages */
    frag->hdr = frag->chdr;
    base->segment.seg_addr.pval = frag->hdr + 1;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;
}
/*
 * Reserve a send work-queue entry for the frag's QP.
 *
 * If none is available the reservation is rolled back and the frag is
 * queued on the QP's no_wqe_pending_frags list at the proper priority.
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
                              mca_btl_openib_send_frag_t *frag)
{
    const int qp = to_base_frag(frag)->base.order;
    const int prio = !(to_base_frag(frag)->base.des_flags &
                       MCA_BTL_DES_FLAGS_PRIORITY);

    if (qp_get_wqe(ep, qp) >= 0) {
        return OPAL_SUCCESS;
    }

    /* no WQE available: undo the reservation and park the frag */
    qp_put_wqe(ep, qp);
    opal_list_append(&ep->qps[qp].no_wqe_pending_frags[prio],
                     (opal_list_item_t *)frag);
    return OPAL_ERR_OUT_OF_RESOURCE;
}
/*
 * Post an RDMA READ for the given get fragment.
 *
 * Acquires a send WQE and a get token (both rolled back on failure),
 * records the frag as in-flight on the QP, then posts the pre-built
 * sr_desc work request. Returns OPAL_ERR_OUT_OF_RESOURCE when either
 * resource is unavailable, OPAL_ERROR if ibv_post_send fails,
 * OPAL_SUCCESS otherwise.
 */
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl,
                                 struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    int qp = to_base_frag(frag)->base.order;
    struct ibv_send_wr *bad_wr;

    /* check for a send wqe */
    if (qp_get_wqe(ep, qp) < 0) {
        /* roll back the speculative reservation taken by qp_get_wqe */
        qp_put_wqe(ep, qp);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* check for a get token */
    if (OPAL_THREAD_ADD32(&ep->get_tokens,-1) < 0) {
        /* return both the WQE and the token we just over-decremented */
        qp_put_wqe(ep, qp);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    qp_inflight_wqe_to_frag(ep, qp, to_com_frag(frag));
    qp_reset_signal_count(ep, qp);

    if (ibv_post_send(ep->qps[qp].qp->lcl_qp, &frag->sr_desc, &bad_wr)) {
        /* post failed: release WQE and get token */
        qp_put_wqe(ep, qp);
        OPAL_THREAD_ADD32(&ep->get_tokens,1);
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
/*
 * Constructor for inbound (receive-side) frags: a single destination
 * segment and no source.
 */
static void in_constructor(mca_btl_wv_in_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    base->base.des_src = NULL;
    base->base.des_src_cnt = 0;
    base->base.des_dst = &base->segment;
    base->base.des_dst_cnt = 1;
}
/*
 * Post a send fragment on an already-connected endpoint.
 *
 * This function is called with endpoint->endpoint_lock held.
 *
 * Resolves the QP order, then acquires a WQE and send credits (in that
 * order; both are released on failure). On a successful post returns
 * OPAL_SUCCESS; if resources are unavailable the frag has been queued by
 * the acquire helpers and OPAL_ERR_RESOURCE_BUSY is returned; a hard
 * ibv post failure returns OPAL_ERROR after rolling everything back.
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
                                      mca_btl_openib_send_frag_t *frag)
{
    int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY;
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc, rc;
    bool do_rdma = false;
    size_t size;

    /* no explicit ordering requested: use the frag's natural QP */
    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;
    qp = des->order;

    if(acquire_wqe(endpoint, frag) != OPAL_SUCCESS)
        return OPAL_ERR_RESOURCE_BUSY;

    /* total bytes on the wire includes any coalesced payload */
    size = des->des_segments->seg_len + frag->coalesced_length;

    rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
                                                 &do_rdma, frag, true);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        /* credits unavailable: give back the WQE we already took */
        qp_put_wqe(endpoint, qp);
        return OPAL_ERR_RESOURCE_BUSY;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if(!ib_rc)
        return OPAL_SUCCESS;

    /* post failed: undo the network-byte-order swap done for the wire */
    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);
    qp_put_wqe(endpoint, qp);

    BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
               ib_rc, strerror(ib_rc), size));
    return OPAL_ERROR;
}
/*
 * Constructor for get (RDMA READ) frags: pre-build a signaled,
 * single-SGE RDMA READ work request whose wr_id points back at the frag.
 */
static void get_constructor(mca_btl_openib_get_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_RECV_USER;

    frag->sr_desc.next = NULL;
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
}
/*
 * Constructor for get (RDMA READ) frags on the Windows Verbs path:
 * pre-build a signaled, single-SGE RDMA READ work request whose WrId
 * points back at the frag.
 */
static void get_constructor(mca_btl_wv_get_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_RECV_USER;

    frag->sr_desc.pNext = NULL;
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.Opcode = WvRdmaRead;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
}
/*
 * Constructor for coalesced frags: source-only descriptor (a single
 * source segment, no destination).
 */
static void coalesced_constructor(mca_btl_wv_coalesced_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_WV_FRAG_COALESCED;
    base->base.des_dst = NULL;
    base->base.des_dst_cnt = 0;
    base->base.des_src = &base->segment;
    base->base.des_src_cnt = 1;
}
/*
 * Common frag constructor: cache the memory registration that came with
 * the free-list item and, when one exists, propagate its lkey into the
 * scatter/gather entry and the segment key.
 */
static void com_constructor(mca_btl_wv_com_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);
    mca_btl_wv_reg_t *reg =
        (mca_btl_wv_reg_t*)base->base.super.registration;

    frag->registration = reg;
    if (NULL != reg) {
        frag->sg_entry.Lkey = reg->mr->lkey;
        base->segment.seg_key.key32[0] = reg->mr->lkey;
    }
}
/*
 * Constructor for outbound frags: a single source segment, no
 * destination, plus a pre-built signaled single-SGE SEND work request
 * whose wr_id points back at the frag.
 */
static void out_constructor(mca_btl_openib_out_frag_t *frag)
{
    mca_btl_openib_frag_t *base = to_base_frag(frag);

    base->base.des_src = &base->segment;
    base->base.des_src_cnt = 1;
    base->base.des_dst = NULL;
    base->base.des_dst_cnt = 0;

    frag->sr_desc.next = NULL;
    frag->sr_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.num_sge = 1;
    frag->sr_desc.sg_list = &to_com_frag(frag)->sg_entry;
    frag->sr_desc.opcode = IBV_WR_SEND;
    frag->sr_desc.send_flags = IBV_SEND_SIGNALED;
}
/*
 * Send the eager RDMA connect message to the remote endpoint.
 *
 * Builds a high-priority control frag carrying the local eager-RDMA
 * region's rkey and base address so the peer can start writing into it.
 * The frag's completion callback is
 * mca_btl_openib_endpoint_eager_rdma_connect_cb.
 *
 * Returns OPAL_SUCCESS when the message was sent or queued,
 * OPAL_ERR_OUT_OF_RESOURCE when no control frag could be allocated, or
 * the error code from mca_btl_openib_endpoint_send (frag returned).
 */
static int mca_btl_openib_endpoint_send_eager_rdma(
        mca_btl_base_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_eager_rdma_header_t *rdma_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    int rc;

    frag = alloc_control_frag(openib_btl);
    if(NULL == frag) {
        /* Fix: return a proper OPAL error code instead of a bare -1 so the
         * result is consistent with the OPAL_* codes used below. */
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    to_base_frag(frag)->base.des_cbfunc =
        mca_btl_openib_endpoint_eager_rdma_connect_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |=
        MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.seg_len =
        sizeof(mca_btl_openib_eager_rdma_header_t);
    to_com_frag(frag)->endpoint = endpoint;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    rdma_hdr = (mca_btl_openib_eager_rdma_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;
    rdma_hdr->control.type = MCA_BTL_OPENIB_CONTROL_RDMA;
    rdma_hdr->rkey = endpoint->eager_rdma_local.reg->mr->rkey;
    rdma_hdr->rdma_start.lval =
        opal_ptr_ptol(endpoint->eager_rdma_local.base.pval);

    BTL_VERBOSE(("sending rkey %" PRIu32 ", rdma_start.lval %" PRIx64
                 ", pval %p, ival %" PRIu32 " type %d and sizeof(rdma_hdr) %d\n",
                 rdma_hdr->rkey,
                 rdma_hdr->rdma_start.lval,
                 rdma_hdr->rdma_start.pval,
                 rdma_hdr->rdma_start.ival,
                 rdma_hdr->control.type,
                 (int) sizeof(mca_btl_openib_eager_rdma_header_t)
                 ));

    /* swap the header to network byte order when the peer needs it */
    if(endpoint->nbo) {
        BTL_OPENIB_EAGER_RDMA_CONTROL_HEADER_HTON((*rdma_hdr));

        BTL_VERBOSE(("after HTON: sending rkey %" PRIu32
                     ", rdma_start.lval %" PRIx64 ", pval %p, ival %" PRIu32 "\n",
                     rdma_hdr->rkey,
                     rdma_hdr->rdma_start.lval,
                     rdma_hdr->rdma_start.pval,
                     rdma_hdr->rdma_start.ival
                     ));
    }

    rc = mca_btl_openib_endpoint_send(endpoint, frag);
    /* RESOURCE_BUSY means the frag was queued and will go out later */
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc)
        return OPAL_SUCCESS;

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending RDMA buffer: %s", strerror(errno)));
    return rc;
}
/*
 * Constructor for receive frags: the wire header sits at the start of
 * the buffer with the payload immediately behind it; also pre-build the
 * single-SGE receive work request pointing back at this frag.
 */
static void recv_constructor(mca_btl_wv_recv_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    base->type = MCA_BTL_WV_FRAG_RECV;

    frag->hdr = (mca_btl_wv_header_t*)base->base.super.ptr;
    base->segment.seg_addr.pval =
        ((unsigned char* )frag->hdr) + sizeof(mca_btl_wv_header_t);
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;

    frag->rd_desc.next = NULL;
    frag->rd_desc.wr_id = (uint64_t)(uintptr_t)frag;
    frag->rd_desc.num_sge = 1;
    frag->rd_desc.sg_list = (WV_SGE*)&to_com_frag(frag)->sg_entry;
}
/*
 * Constructor for outbound frags (Windows Verbs path): single source
 * segment, no destination, plus a pre-built signaled single-SGE SEND
 * work request whose WrId points back at the frag.
 */
static void out_constructor(mca_btl_wv_out_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);

    base->base.des_src = &base->segment;
    base->base.des_src_cnt = 1;
    base->base.des_dst = NULL;
    base->base.des_dst_cnt = 0;

    frag->sr_desc.pNext = NULL;
    frag->sr_desc.WrId = (uint64_t)(uintptr_t)frag;
    frag->sr_desc.nSge = 1;
    frag->sr_desc.pSgl = (WV_SGE*)&to_com_frag(frag)->sg_entry;
    frag->sr_desc.Opcode = WvSend;
    frag->sr_desc.Flags = WV_SEND_SIGNALED;
}
/*
 * Constructor for send frags.
 *
 * chdr marks the very start of the buffer; the regular wire header (hdr)
 * follows the coalesce and control headers so they can be prepended
 * without moving the payload. The payload starts right after hdr.
 */
static void send_constructor(mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_frag_t *base = to_base_frag(frag);
    unsigned char *buf = (unsigned char*)base->base.super.ptr;

    base->type = MCA_BTL_WV_FRAG_SEND;

    frag->chdr = (mca_btl_wv_header_t*)buf;
    frag->hdr = (mca_btl_wv_header_t*)
        (buf + sizeof(mca_btl_wv_header_coalesced_t) +
         sizeof(mca_btl_wv_control_header_t));
    base->segment.seg_addr.pval = frag->hdr + 1;
    to_com_frag(frag)->sg_entry.pAddress = (void*)(uintptr_t)frag->hdr;

    frag->coalesced_length = 0;
    OBJ_CONSTRUCT(&frag->coalesced_frags, opal_list_t);
}
/*
 * Attempt to send a fragment using a given endpoint. If the endpoint is
 * not connected, queue the fragment and start the connection as required.
 *
 * OPAL_ERR_RESOURCE_BUSY from the post path means the frag was queued for
 * a later retry, so it is reported to the caller as success.
 */
int mca_btl_openib_endpoint_send(mca_btl_base_endpoint_t* ep,
                                 mca_btl_openib_send_frag_t* frag)
{
    int rc;

    OPAL_THREAD_LOCK(&ep->endpoint_lock);
    rc = check_endpoint_state(ep, &to_base_frag(frag)->base,
                              &ep->pending_lazy_frags);
    if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
        rc = mca_btl_openib_endpoint_post_send(ep, frag);
    }
    OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

    return OPAL_UNLIKELY(OPAL_ERR_RESOURCE_BUSY == rc) ? OPAL_SUCCESS : rc;
}
/*
 * Free-list initializer for wv frags.
 *
 * Receive frags get their QP index and an SGE length big enough for the
 * payload plus every header variant; send frags get only the QP index.
 * All frags remember the free list they came from.
 */
void mca_btl_wv_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_btl_wv_frag_init_data_t* init = (mca_btl_wv_frag_init_data_t *) ctx;
    mca_btl_wv_frag_t *frag = to_base_frag(item);

    switch (frag->type) {
    case MCA_BTL_WV_FRAG_RECV:
        to_recv_frag(frag)->qp_idx = init->order;
        to_com_frag(frag)->sg_entry.Length =
            mca_btl_wv_component.qp_infos[init->order].size +
            sizeof(mca_btl_wv_header_t) +
            sizeof(mca_btl_wv_header_coalesced_t) +
            sizeof(mca_btl_wv_control_header_t);
        break;
    case MCA_BTL_WV_FRAG_SEND:
        to_send_frag(frag)->qp_idx = init->order;
        break;
    default:
        break;
    }

    frag->list = init->list;
}
/**
 * This function is called when we get an error on the completion
 * event of a fragment. We check to see what type of fragment it is
 * and act accordingly. In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL. The first control message will tell the
 * remote side to also map out this connection. The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state. See that function for more details.
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          opal_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;
    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag. If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here. If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error. Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done. The important information needs to be read
     * from the fragment. */

    /* Cannot issue callback to SRQ errors because the shared receive
     * queue is shared and is not specific to a connection. There is no
     * way to figure out what type of message created the error because
     * we need the information in the wc->imm_data field which does not
     * exist when we have an error. So, nothing to do here but return. */
    if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", openib_frag_type(des));
        /* Need to think about returning any shared resources of the
         * SRQ. For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug */
    if (NULL != openib_btl) {
        asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name);
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor. They are not associated with a PML
     * descriptor because:
     *   A. It was a receive
     *   B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                             remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer. The change in the PML
         * layer should prevent that we ever try to send on this BTL
         * again. If we do, then this is an error case. */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint,
                                           MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }
        /* NOTE(review): "OPENIG" below looks like a typo for "OPENIB" in
         * this log message; left unchanged here. */
        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIG_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            openib_frag_type(des), (long unsigned int) des);
        if (NULL != btlname) free(btlname);
        return;
    }

    /* These are RDMA read type fragments. Just continue with processing */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write. Call the PML callback function to map out this
     * btl for further sending. We just call this every time we get an
     * error even though it is not necessary. Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    if (NULL != btlname) free(btlname);

    /* Since we believe we have done a send, read or write, then the
     * des_local fields should have valid data. */
    assert(des->des_local != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
     * change the status. Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it. In
     * addition, send a message to other end of the connection letting
     * it know that this side is now broken. This is needed in the case
     * of a spurious error which may not cause the remote side to detect
     * the error. */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    /* Now, call the callback function associated with the fragment.
     * In case the fragments were coalesced we need to pull them apart
     * and call the callback function for each one. */
    if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
        opal_list_item_t *i;
        while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags &
                             MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if( btl_ownership ) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }
    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
    if( btl_ownership ) {
        mca_btl_openib_free(&openib_btl->super, des);
    }
    /* Here we send another control message to notify the remote side
     * we had an error on a eager fragment. A non-zero value for the
     * ftr variable indicates that this was an eager RDMA fragment.
     * We need to do this in case the eager RDMA fragment after this
     * one actually made it successfully.
     *
     * NOTE(review): des may already have been returned to the free list
     * by mca_btl_openib_free above; reading to_send_frag(des)->ftr here
     * looks like a use-after-return — verify that frag memory stays
     * valid after being freed back to the list. */
    if (0 != to_send_frag(des)->ftr) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       (long)to_send_frag(des)->ftr - 1);
    }

    /* We know we have completed a send so return some resources even
     * though connection is broken. With SRQ, the resources are shared
     * so if we do not return the credits we may not be allowed to send
     * anymore. */
    qp_put_wqe(endpoint, qp);
    if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) &&
       !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them. Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path. This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments so
     * subsequent calls are no-ops. */
    if (endpoint) {
        error_out_all_pending_frags(endpoint, &openib_btl->super, true);
    }
}
/** * This function will find all the pending fragments on an endpoint * and call the callback function with OPAL_ERROR. It walks through * each qp with each priority and looks for both no_credits_pending_frags * and no_wqe_pending_frags. It then looks for any pending_lazy_frags, * pending_put_frags, and pending_get_frags. This function is only * called when running with failover support enabled. Note that * the errout parameter allows the function to also be used as a * debugging tool to see if there are any fragments on any of the * queues. * @param ep Pointer to endpoint that had error * @param module Pointer to module that had error * @param errout Boolean which says whether to error them out or not */ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, struct mca_btl_base_module_t* module, bool errout) { int qp, pri, len, total, btl_ownership; opal_list_item_t *item; mca_btl_openib_com_frag_t* frag; mca_btl_base_descriptor_t *des; int verbose = 10; /* Verbosity level unless debugging */ /* If debugging, drop verbosity level so we can see the output * regardless of the level the program was run with. */ if (false == errout) { verbose = 0; } total = 0; /* Traverse all QPs and all priorities and move to other endpoint */ for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { for (pri = 0; pri < 2; ++pri) { /* All types of qp's have a no_wqe_pending_frags list */ len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for no_wqe_pending_frags qp=%d, " "pri=%d, list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. 
no_wqe_pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in no_wqe_pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } if (BTL_OPENIB_QP_TYPE_PP(qp)) { len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for no_credits_pending_frags qp=%d, " "pri=%d, list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. 
no_credits_pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in " "no_credits_pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for srq pending_frags qp=%d, pri=%d, " "list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp]. 
u.srq_qp.pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in SRQ pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } } } } /* Check for any frags from a connection that was never made. Not sure if this * can actually happen. */ len = opal_list_get_size(&ep->pending_lazy_frags); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for pending_lazy_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } len = opal_list_get_size(&ep->pending_put_frags); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for pending_put_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } len = opal_list_get_size(&ep->pending_get_frags); if (len > 0) { total += len; opal_output_verbose(verbose, 
mca_btl_openib_component.verbose_failover, "IB: Checking for pending_get_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover, "IB: Finished checking for pending_frags, total moved=%d", total); }
/*
 * Send explicit credit-update message to the peer for the given QP.
 *
 * Lazily allocates and caches a per-QP control frag on first use
 * (fix: the alloc_control_frag result is now NULL-checked before use —
 * it was previously dereferenced unchecked, unlike its other callers).
 * Prefers the eager-RDMA channel when a send credit is available there;
 * otherwise falls back to the QP while respecting the reserved receive
 * slots (rd_rsv). On any bail-out or post failure the harvested credits
 * are returned and the credits-send lock is released.
 */
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
                                          const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        if (OPAL_UNLIKELY(NULL == frag)) {
            /* no control frag available: drop this credit update and
             * release the send lock; credits will go out later */
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_IB;
        to_base_frag(frag)->segment.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.seg_addr.pval;

    if(OPAL_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        /* no eager RDMA credit: use the QP, but keep within the
         * reserved receive slots for credit messages */
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
           (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits,
                           frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        /* cm_seen is a single byte on the wire: cap at 255 and give the
         * remainder back for the next credit message */
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits,
                           credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
        BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    /* post failed: undo byte swapping, give back credits/tokens, unlock */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                      frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                      credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s", rc,
               strerror(errno)));
}
/*
 * Setup eager RDMA buffers and notify the remote endpoint.
 *
 * Publication protocol: eager_rdma_local.base.pval is atomically CAS'd
 * from NULL to (void*)1 as an in-progress sentinel so only one thread
 * performs the setup; on success it is CAS'd from 1 to the real buffer
 * address, on failure back to NULL so a later call can retry.
 */
void mca_btl_openib_endpoint_connect_eager_rdma(
        mca_btl_openib_endpoint_t* endpoint)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_openib_recv_frag_t *headers_buf;
    int i;
    uint32_t flag = MCA_MPOOL_FLAGS_CACHE_BYPASS;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                               (void*)1))
        return;

    /* frag descriptors live in plain malloc'd memory; only the payload
     * buffer below needs to be registered for RDMA */
    headers_buf = (mca_btl_openib_recv_frag_t*)
        malloc(sizeof(mca_btl_openib_recv_frag_t) *
               mca_btl_openib_component.eager_rdma_num);
    if(NULL == headers_buf)
        goto unlock_rdma_local;

#if HAVE_DECL_IBV_ACCESS_SO
    /* Solaris implements the Relaxed Ordering feature defined in the
       PCI Specification. With this in mind any memory region which
       relies on a buffer being written in a specific order, for
       example the eager rdma connections created in this routinue,
       must set a strong order flag when registering the memory for
       rdma operations. The following flag will be interpreted and
       the appropriate steps will be taken when the memory is
       registered in openib_reg_mr(). */
    flag |= MCA_MPOOL_FLAGS_SO_MEM;
#endif

    buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool,
            openib_btl->eager_rdma_frag_size *
            mca_btl_openib_component.eager_rdma_num,
            mca_btl_openib_component.buffer_alignment, flag,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);

    if(!buf)
        goto free_headers_buf;

    /* offset so that header+payload+footer of each slot ends exactly at
     * the slot boundary (the footer is polled for message arrival) */
    buf = buf + openib_btl->eager_rdma_frag_size -
        sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit -
        sizeof(mca_btl_openib_header_t);

    for(i = 0; i < mca_btl_openib_component.eager_rdma_num; i++) {
        opal_free_list_item_t *item;
        mca_btl_openib_recv_frag_t * frag;
        mca_btl_openib_frag_init_data_t init_data;

        /* seed the free-list item fields, then run the regular frag
         * constructor and initializer over it */
        item = (opal_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)
            endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * openib_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t);

        init_data.order = mca_btl_openib_component.credits_qp;
        init_data.list = NULL;
        mca_btl_openib_frag_init(item, &init_data);

        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        /* footer lives right after the maximum eager payload */
        frag->ftr = (mca_btl_openib_footer_t*)
            ((char*)to_base_frag(frag)->segment.seg_addr.pval +
             mca_btl_openib_component.eager_limit);

        MCA_BTL_OPENIB_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* rd_win = num/4 (minimum 1): window at which credits are returned */
    endpoint->eager_rdma_local.rd_win =
        mca_btl_openib_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 (void*)1, buf);

    if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) {
        mca_btl_openib_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_openib_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* claim the next free slot in the device's buffer array; CAS loop
         * handles races with concurrent registrations */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&openib_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* send failed: tear everything down and allow a retry later */
    openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool,
            buf,
            (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
                                 endpoint->eager_rdma_local.base.pval,
                                 NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
/* Object-class constructor for RDMA-write ("put") fragments: preset the
 * ibverbs work-request opcode to RDMA write and tag the base fragment as a
 * user send frag.  Runs once per fragment when the free list is populated. */
static void put_constructor(mca_btl_openib_put_frag_t *frag)
{
    to_out_frag(frag)->sr_desc.opcode = IBV_WR_RDMA_WRITE;
    to_base_frag(frag)->type = MCA_BTL_OPENIB_FRAG_SEND_USER;
}
/* Object-class constructor for RDMA-write ("put") fragments (WinVerbs
 * variant): preset the work-request opcode to WvRdmaWrite and tag the base
 * fragment as a user send frag. */
static void put_constructor(mca_btl_wv_put_frag_t *frag)
{
    to_out_frag(frag)->sr_desc.Opcode = WvRdmaWrite;
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_SEND_USER;
}
/**
 * This function is used to send a message to the remote side
 * indicating the endpoint is broken and telling the remote side to
 * bring its endpoint down as well.  This is needed because there are
 * cases where only one side of the connection determines that
 * there was a problem.
 *
 * The message is deliberately sent over a DIFFERENT BTL than the one
 * that saw the error, since the erroring path may be unusable.
 *
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending RDMA error message, index is non zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint, uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than this one that got the
     * error to send the message over. */
    for(i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it.  The device
     * associated with the BTL has the list of all the
     * endpoints.  Match on the remote process identity. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            /* pointer array may be sparse */
            continue;
        }
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if(NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag available, then just return. */
        return;
    }

    /* Control message: high priority, always invoke the completion
     * callback, sent on the credits QP. */
    to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY|MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len = sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;
    frag->hdr->tag = MCA_BTL_TAG_IB;

    /* Fill in the broken-connection payload: identify the FAILED port
     * (lid/subnet of the original endpoint's BTL, not the one used to
     * send this message) so the peer can tear down the right endpoint. */
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = opal_process_name_vpid(OPAL_PROC_MY_NAME);
    bc_hdr->index = index;

    if(newep->nbo) {
        /* peer has different endianness: convert header to network order */
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        /* BUSY means the frag was queued for later delivery; the frag is
         * now owned by the send path either way. */
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)",
               strerror(errno)));
    return;
}
/* Setup eager RDMA buffers and notify the remote endpoint.
 *
 * Allocates a registered buffer sliced into eager_rdma_num fragments,
 * initializes a recv-frag header for each slice, publishes the buffer via
 * endpoint->eager_rdma_local.base, and tells the peer about it.  Thread
 * exclusion is lock-free: the base pointer is CAS'd from NULL to the
 * sentinel (void*)1 while setup is in progress. */
void mca_btl_wv_endpoint_connect_eager_rdma(mca_btl_wv_endpoint_t* endpoint)
{
    mca_btl_wv_module_t* wv_btl = endpoint->endpoint_btl;
    char *buf;
    mca_btl_wv_recv_frag_t *headers_buf;
    int i;

    /* Set local rdma pointer to 1 temporarily so other threads will not try
     * to enter the function */
    if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, (void*)1))
        return;

    headers_buf = (mca_btl_wv_recv_frag_t*)
        malloc(sizeof(mca_btl_wv_recv_frag_t) * mca_btl_wv_component.eager_rdma_num);
    if(NULL == headers_buf)
        goto unlock_rdma_local;

    /* One registered allocation holds all eager RDMA fragments */
    buf = (char *) wv_btl->super.btl_mpool->mpool_alloc(wv_btl->super.btl_mpool,
            wv_btl->eager_rdma_frag_size * mca_btl_wv_component.eager_rdma_num,
            mca_btl_wv_component.buffer_alignment,
            MCA_MPOOL_FLAGS_CACHE_BYPASS,
            (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
    if(!buf)
        goto free_headers_buf;

    /* Bias the base pointer so that frag #i's payload lands at
     * buf + i * eager_rdma_frag_size (header precedes payload, footer
     * follows the eager-limit region) */
    buf = buf + wv_btl->eager_rdma_frag_size - sizeof(mca_btl_wv_footer_t) -
        wv_btl->super.btl_eager_limit - sizeof(mca_btl_wv_header_t);

    for(i = 0; i < mca_btl_wv_component.eager_rdma_num; i++) {
        ompi_free_list_item_t *item;
        mca_btl_wv_recv_frag_t * frag;
        mca_btl_wv_frag_init_data_t init_data;

        /* headers_buf entries are constructed in place; they are NOT on a
         * free list (init_data.list = NULL) */
        item = (ompi_free_list_item_t*)&headers_buf[i];
        item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg;
        item->ptr = buf + i * wv_btl->eager_rdma_frag_size;
        OBJ_CONSTRUCT(item, mca_btl_wv_recv_frag_t);

        init_data.order = mca_btl_wv_component.credits_qp;
        init_data.list = NULL;

        mca_btl_wv_frag_init(item, &init_data);
        frag = to_recv_frag(item);
        to_base_frag(frag)->type = MCA_BTL_WV_FRAG_EAGER_RDMA;
        to_com_frag(frag)->endpoint = endpoint;
        /* footer sits immediately after the eager-limit payload area */
        frag->ftr = (mca_btl_wv_footer_t*)
            ((char*)to_base_frag(frag)->segment.base.seg_addr.pval +
             mca_btl_wv_component.eager_limit);

        MCA_BTL_WV_RDMA_MAKE_REMOTE(frag->ftr);
    }

    endpoint->eager_rdma_local.frags = headers_buf;

    /* Credit window: 1/4 of the ring, but at least 1 */
    endpoint->eager_rdma_local.rd_win = mca_btl_wv_component.eager_rdma_num >> 2;
    endpoint->eager_rdma_local.rd_win =
        endpoint->eager_rdma_local.rd_win?endpoint->eager_rdma_local.rd_win:1;

    /* set local rdma pointer to real value */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1, buf);

    if(mca_btl_wv_endpoint_send_eager_rdma(endpoint) == OMPI_SUCCESS) {
        mca_btl_wv_device_t *device = endpoint->endpoint_btl->device;
        mca_btl_wv_endpoint_t **p;
        OBJ_RETAIN(endpoint);
        assert(((opal_object_t*)endpoint)->obj_reference_count == 2);
        /* Claim a free slot in the device's eager_rdma_buffers array;
         * retry the CAS if another thread bumps the count concurrently */
        do {
            p = &device->eager_rdma_buffers[device->eager_rdma_buffers_count];
        } while(!opal_atomic_cmpset_ptr(p, NULL, endpoint));

        OPAL_THREAD_ADD32(&wv_btl->eager_rdma_channels, 1);
        /* from this point progress function starts to poll new buffer */
        OPAL_THREAD_ADD32(&device->eager_rdma_buffers_count, 1);
        return;
    }

    /* Notification to the peer failed: unwind in reverse order */
    wv_btl->super.btl_mpool->mpool_free(wv_btl->super.btl_mpool, buf,
            (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
free_headers_buf:
    free(headers_buf);
unlock_rdma_local:
    /* set local rdma pointer back to zero. Will retry later */
    opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval,
            endpoint->eager_rdma_local.base.pval, NULL);
    endpoint->eager_rdma_local.frags = NULL;
}
/* Post a send fragment on the given endpoint, piggybacking any pending
 * receive/RDMA credits onto the header.  Acquires a WQE and (unless the
 * frag goes over eager RDMA) a send credit first; on failure the frag is
 * queued by the acquire helpers and OMPI_ERR_RESOURCE_BUSY is returned.
 * On a failed hardware post, all acquired resources and piggybacked
 * credits are rolled back and OMPI_ERROR is returned.
 *
 * NOTE: this function is called with endpoint->endpoint_lock held. */
int mca_btl_wv_endpoint_post_send(mca_btl_wv_endpoint_t *endpoint,
                                  mca_btl_wv_send_frag_t *frag)
{
    mca_btl_wv_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    size_t eager_limit;

    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acruire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_RESOURCE_BUSY;

    /* eager RDMA slots are sized for the payload plus the coalesce and
     * control headers, hence the adjusted limit */
    eager_limit = mca_btl_wv_component.eager_limit +
        sizeof(mca_btl_wv_header_coalesced_t) +
        sizeof(mca_btl_wv_control_header_t);
    if(des->des_src->seg_len + frag->coalesced_length <= eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    /* Piggyback locally accumulated eager-RDMA credits onto this send */
    BTL_WV_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_WV_RDMA_CREDITS_FLAG;

    if(!do_rdma) {
        /* on a send/recv path we can also piggyback PP receive credits,
         * but only if no RDMA credits were claimed above */
        if(BTL_WV_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        }
    } else {
        /* eager-RDMA path: encode the QP number in the upper credit bits */
        hdr->credits |= (qp << 11);
    }

    BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bits wide, but cm_return is a 32-bit counter:
     * report at most 255 now and return the remainder for a later send */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    ib_rc = post_send(endpoint, frag, do_rdma);

    if(!ib_rc)
        return OMPI_SUCCESS;

    /* ---- post failed: roll back everything taken above ---- */

    /* header may already have been byte-swapped for the peer; undo before
     * reading credits back out of it */
    if(endpoint->nbo)
        BTL_WV_HEADER_NTOH(*hdr);

    if(BTL_WV_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                BTL_WV_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if(do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_WV_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                    hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else if BTL_WV_QP_TYPE_SRQ(qp){
            mca_btl_wv_module_t *wv_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&wv_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}
/* Initiate an RDMA read (get) of `size` bytes from remote_address on the
 * peer `ep` into local_address.  Returns OPAL_SUCCESS when the read was
 * posted or queued for later (pending connection / out of resources),
 * OPAL_ERR_BAD_PARAM when size exceeds the BTL get limit, or an error code.
 * The completion callback fires via the frag's cb fields. */
int mca_btl_openib_get (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address,
                        uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                        mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                        int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_openib_get_frag_t* frag = NULL;
    int qp = order;
    int rc;

    if (OPAL_UNLIKELY(size > btl->btl_get_limit)) {
        return OPAL_ERR_BAD_PARAM;
    }

    frag = to_get_frag(alloc_recv_user_frag());
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    if (MCA_BTL_NO_ORDER == qp) {
        /* caller does not care about ordering: use the dedicated RDMA QP */
        qp = mca_btl_openib_component.rdma_qp;
    }

    /* set base descriptor flags */
    to_base_frag(frag)->base.order = qp;
    /* free this descriptor when the operation is complete */
    to_base_frag(frag)->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;

    /* set up scatter-gather entry */
    to_com_frag(frag)->sg_entry.length = size;
    to_com_frag(frag)->sg_entry.lkey = local_handle->lkey;
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t) local_address;
    to_com_frag(frag)->endpoint = ep;

    /* set up rdma callback */
    frag->cb.func = cbfunc;
    frag->cb.context = cbcontext;
    frag->cb.data = cbdata;
    frag->cb.local_handle = local_handle;

    /* set up descriptor */
    frag->sr_desc.wr.rdma.remote_addr = remote_address;
    /* the opcode may have been changed by an atomic operation */
    frag->sr_desc.opcode = IBV_WR_RDMA_READ;

#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    /* rkeys travel in the remote side's byte order; swap if endianness
     * differs between the two processes */
    if((ep->endpoint_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN)
            != (opal_proc_local_get()->proc_arch & OPAL_ARCH_ISBIGENDIAN)) {
        frag->sr_desc.wr.rdma.rkey = opal_swap_bytes4 (remote_handle->rkey);
    } else
#endif
    {
        frag->sr_desc.wr.rdma.rkey = remote_handle->rkey;
    }

#if HAVE_XRC
    /* XRC QPs additionally need the remote shared-receive-queue number */
    if (MCA_BTL_XRC_ENABLED && BTL_OPENIB_QP_TYPE_XRC(qp)) {
#if OPAL_HAVE_CONNECTX_XRC_DOMAINS
        frag->sr_desc.qp_type.xrc.remote_srqn = ep->rem_info.rem_srqs[qp].rem_srq_num;
#else
        frag->sr_desc.xrc_remote_srq_num = ep->rem_info.rem_srqs[qp].rem_srq_num;
#endif
    }
#endif

    if (ep->endpoint_state != MCA_BTL_IB_CONNECTED) {
        OPAL_THREAD_LOCK(&ep->endpoint_lock);
        rc = check_endpoint_state(ep, &to_base_frag(frag)->base, &ep->pending_get_frags);
        OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        if (OPAL_ERR_RESOURCE_BUSY == rc) {
            /* connection in progress; frag was queued on pending_get_frags */
            return OPAL_SUCCESS;
        }
        if (OPAL_SUCCESS != rc) {
            MCA_BTL_IB_FRAG_RETURN (frag);
            return rc;
        }
    }

    rc = mca_btl_openib_get_internal (btl, ep, frag);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        if (OPAL_LIKELY(OPAL_ERR_OUT_OF_RESOURCE == rc)) {
            /* no tokens/WQEs right now: park the frag and report success;
             * progress will retry it later */
            rc = OPAL_SUCCESS;

            OPAL_THREAD_LOCK(&ep->endpoint_lock);
            opal_list_append(&ep->pending_get_frags, (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
        } else {
            MCA_BTL_IB_FRAG_RETURN (frag);
        }
    }

    return rc;
}
/* Object-class constructor for control-message fragments (WinVerbs variant).
 * Mirrors the openib BTL's send_control_constructor: besides tagging the
 * frag as a control frag, the header pointers must be adjusted because
 * control messages carry no coalesce header — the real header is the
 * control header, the payload starts right after it, and the SGE must
 * point at the header.  The original wv port omitted these adjustments. */
static void send_control_constructor(mca_btl_wv_send_control_frag_t *frag)
{
    to_base_frag(frag)->type = MCA_BTL_WV_FRAG_CONTROL;
    /* adjusting headers because there is no coalesce header in control messages */
    frag->hdr = frag->chdr;
    /* payload begins immediately after the (non-coalesced) header */
    to_base_frag(frag)->segment.base.seg_addr.pval = frag->hdr + 1;
    /* scatter-gather entry covers the message starting at the header */
    to_com_frag(frag)->sg_entry.addr = (uint64_t)(uintptr_t)frag->hdr;
}