/**
 * Post an RDMA get fragment on the queue pair selected by the fragment's
 * ordering field.
 *
 * A send WQE and a get token are reserved before the work request is posted.
 * If either reservation fails, or if ibv_post_send() reports an error, every
 * reservation made so far is handed back before returning.
 *
 * @param btl   BTL module (not referenced by this function)
 * @param ep    Endpoint the get is issued on
 * @param frag  Get fragment carrying the send work request (sr_desc)
 *
 * @return OPAL_SUCCESS when the work request was posted,
 *         OPAL_ERR_OUT_OF_RESOURCE when no WQE or get token is available,
 *         OPAL_ERROR when ibv_post_send() fails.
 */
int mca_btl_openib_get_internal (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep,
                                 mca_btl_openib_get_frag_t *frag)
{
    struct ibv_send_wr *failed_wr;
    int qp_idx = to_base_frag(frag)->base.order;
    int status = OPAL_ERR_OUT_OF_RESOURCE;

    /* reserve a send wqe */
    if (qp_get_wqe(ep, qp_idx) < 0) {
        goto undo_wqe;
    }

    /* reserve a get token */
    if (OPAL_THREAD_ADD32(&ep->get_tokens, -1) < 0) {
        goto undo_wqe_and_token;
    }

    qp_inflight_wqe_to_frag(ep, qp_idx, to_com_frag(frag));
    qp_reset_signal_count(ep, qp_idx);

    if (0 == ibv_post_send(ep->qps[qp_idx].qp->lcl_qp, &frag->sr_desc, &failed_wr)) {
        return OPAL_SUCCESS;
    }

    /* the post itself failed: report a generic error after cleanup */
    status = OPAL_ERROR;

undo_wqe_and_token:
    /* give back the wqe first, then the token */
    qp_put_wqe(ep, qp_idx);
    OPAL_THREAD_ADD32(&ep->get_tokens, 1);
    return status;

undo_wqe:
    qp_put_wqe(ep, qp_idx);
    return status;
}
/**
 * Post a send fragment on an endpoint.
 *
 * This function is called with endpoint->endpoint_lock held.
 *
 * The queue pair is taken from the descriptor's order field; when the
 * descriptor carries MCA_BTL_NO_ORDER the fragment's own qp_idx is stored
 * there first. A send WQE and then send credits are acquired before the
 * fragment is posted. On any failure the resources acquired so far are
 * released again.
 *
 * @param endpoint  Endpoint to send on
 * @param frag      Send fragment to post
 *
 * @return OPAL_SUCCESS when the fragment was posted,
 *         OPAL_ERR_RESOURCE_BUSY when no WQE or credit could be acquired,
 *         OPAL_ERROR when post_send() failed.
 */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    int prio = to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY;
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc, rc;
    bool do_rdma = false;
    size_t size;

    if (OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) {
        des->order = frag->qp_idx;
    }

    qp = des->order;

    if (acquire_wqe(endpoint, frag) != OPAL_SUCCESS) {
        return OPAL_ERR_RESOURCE_BUSY;
    }

    size = des->des_segments->seg_len + frag->coalesced_length;

    rc = mca_btl_openib_endpoint_credit_acquire (endpoint, qp, prio, size,
                                                 &do_rdma, frag, true);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        /* no credit: hand the wqe back so it is not leaked */
        qp_put_wqe(endpoint, qp);
        return OPAL_ERR_RESOURCE_BUSY;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if (!ib_rc) {
        return OPAL_SUCCESS;
    }

    /* The post failed: convert the header back when the endpoint uses
     * network byte order, then undo the credit and wqe reservations. */
    if (endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*hdr);
    }

    mca_btl_openib_endpoint_credit_release (endpoint, qp, do_rdma, frag);

    qp_put_wqe(endpoint, qp);

    /* size is a size_t; cast it so the argument matches the %lu conversion
     * on platforms where size_t is not unsigned long */
    BTL_ERROR(("error posting send request error %d: %s. size = %lu\n",
               ib_rc, strerror(ib_rc), (unsigned long) size));
    return OPAL_ERROR;
}
/**
 * Try to reserve a send WQE on the queue pair named by the fragment's
 * ordering field.
 *
 * When no WQE is available the reservation attempt is undone and the
 * fragment is appended to that queue pair's no_wqe_pending_frags list.
 * List index 0 is used for fragments that carry
 * MCA_BTL_DES_FLAGS_PRIORITY, index 1 for all others.
 *
 * @param ep    Endpoint owning the queue pair
 * @param frag  Send fragment that needs a WQE
 *
 * @return OPAL_SUCCESS when a WQE was reserved,
 *         OPAL_ERR_OUT_OF_RESOURCE when the fragment was queued instead.
 */
static inline int acquire_wqe(mca_btl_openib_endpoint_t *ep,
        mca_btl_openib_send_frag_t *frag)
{
    int qp_idx = to_base_frag(frag)->base.order;
    /* logical NOT maps "priority flag set" to 0 and "not set" to 1 */
    int pending_list = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);

    if (!(qp_get_wqe(ep, qp_idx) < 0)) {
        return OPAL_SUCCESS;
    }

    /* out of wqes: undo the failed reservation and park the fragment */
    qp_put_wqe(ep, qp_idx);
    opal_list_append(&ep->qps[qp_idx].no_wqe_pending_frags[pending_list],
                     (opal_list_item_t *)frag);

    return OPAL_ERR_OUT_OF_RESOURCE;
}
/** * This function is called when we get an error on the completion * event of a fragment. We check to see what type of fragment it is * and act accordingly. In most cases, we first call up into the PML * and have it map out this connection for any future communication. * In addition, this function will possibly send some control messages * over the other openib BTL. The first control message will tell the * remote side to also map out this connection. The second control * message makes sure the eager RDMA connection remains in a sane * state. See that function for more details. * @param openib_btl Pointer to BTL that had the error * @param des Pointer to descriptor that had the error * @param qp Queue pair that had the error * @param remote_proc Pointer to process that had the error * @param endpoint Pointer to endpoint that had the error */ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, mca_btl_base_descriptor_t *des, int qp, opal_proc_t* remote_proc, mca_btl_openib_endpoint_t* endpoint) { char *btlname = NULL; int btl_ownership; /* Since this BTL supports failover, it will call the PML error handler * function with the NONFATAL flag. If the PML is running with failover * support, then it will map out the endpoint for further communication * and return control here. If the PML does not have failover support, * it will abort the job and control will not return here. */ /* Note: At this point, what needs to be done is based on the type * of openib fragment that got the error. Also note that in the wc * struct, when wc->status != IBV_WC_SUCCESS, these are the only * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num. * This means that one cannot key off of the wc->opcode to see what * operation was done. The important information needs to be read * from the fragment. */ /* Cannot issue callback to SRQ errors because the shared receive * queue is shared and is not specific to a connection. 
There is no * way to figure out what type of message created the error because * we need the information in the wc->imm_data field which does not * exist when we have an error. So, nothing to do here but return. */ if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) { opal_output_verbose(20, mca_btl_openib_component.verbose_failover, "SRQ RECV type=%d", openib_frag_type(des)); /* Need to think about returning any shared resources of the * SRQ. For now, we do nothing as we rarely see an error on * the SRQ. */ return; } assert(NULL != remote_proc); /* Create a nice string to help with debug */ if (NULL != openib_btl) { asprintf(&btlname, "lid=%d:name=%s", openib_btl->lid, openib_btl->device->ib_dev->name); } /* The next set of errors are associated with an endpoint, but not * with a PML descriptor. They are not associated with a PML * descriptor because: * A. It was a receive * B. It was some type of openib specific control message. * Therefore, just drop the fragments and call up into the PML to * disable this endpoint for future communication. */ if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) || (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) || (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) { openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); /* Now that this connection has been mapped out at the PML layer, * we change the state in the BTL layer. The change in the PML * layer should prevent that we ever try to send on this BTL * again. If we do, then this is an error case. 
*/ if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { endpoint->endpoint_state = MCA_BTL_IB_FAILED; mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); error_out_all_pending_frags(endpoint, &openib_btl->super, true); } opal_output_verbose(60, mca_btl_openib_component.verbose_failover, "MCA_BTL_OPENIG_FRAG=%d, " "dropping since connection is broken (des=%lx)", openib_frag_type(des), (long unsigned int) des); if (NULL != btlname) free(btlname); return; } /* These are RDMA read type fragments. Just continue with processing */ if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) { OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); opal_output_verbose(20, mca_btl_openib_component.verbose_failover, "OPENIB_FRAG_RECV_USER fragment, " "btl=%lx, continue with callbacks", (long unsigned int) &openib_btl->super); } /* If we are at this point, we have completed a send, RDMA read or * RDMA write. Call the PML callback function to map out this * btl for further sending. We just call this every time we get an * error even though it is not necessary. Subsequent calls with * the same remote_proc argument will not actually map anything out. */ openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); if (NULL != btlname) free(btlname); /* Since we believe we have done a send, read or write, then the * des_local fields should have valid data. */ assert(des->des_local != NULL); /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then * change the status. Since this connection was mapped out in the * PML layer, no more attempts should be made to send on it. In * addition, send a message to other end of the connection letting * it know that this side is now broken. This is needed in the case * of a spurious error which may not cause the remote side to detect * the error. 
*/ if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { endpoint->endpoint_state = MCA_BTL_IB_FAILED; mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); } /* Now, call the callback function associated with the fragment. * In case the fragments were coalesced we need to pull them apart * and call the callback function for each one. */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base); } } } /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER * or MCA_BTL_OPENIB_FRAG_RECV_USER. */ btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, des); } /* Here we send another control message to notify the remote side * we had an error on a eager fragment. A non-zero value for the * ftr variable indicates that this was an eager RDMA fragment. * We need to do this in case the eager RDMA fragment after this * one actually made it successfully. */ if (0 != to_send_frag(des)->ftr) { mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR, (long)to_send_frag(des)->ftr - 1); } /* We know we have completed a send so return some resources even * though connection is broken. With SRQ, the resources are shared * so if we do not return the credits we may not be allowed to send * anymore. 
*/ qp_put_wqe(endpoint, qp); if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) { OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); } /* There are several queues associated with an endpoint that may * have some unsent fragments sitting in them. Remove them and * call the callback functions with an error so the PML can send * them down a different path. This really only needs to be called * once on an endpoint, but for now, just call it a bunch of times. * The first time through will remove the unsent fragments so * subsequent calls are no-ops. */ if (endpoint) { error_out_all_pending_frags(endpoint, &openib_btl->super, true); } }
/**
 * Post a send fragment on a wv endpoint.
 *
 * This function is called with endpoint->endpoint_lock held.
 *
 * Steps, in order: pick the queue pair, reserve a WQE, obtain either an
 * eager RDMA credit (small priority fragments only) or a regular send
 * credit, piggy-back pending credits in the header, then post the send.
 * When the post fails the header is converted back if needed and the
 * reserved resources are returned.
 *
 * @param endpoint  Endpoint to send on
 * @param frag      Send fragment to post
 *
 * @return OMPI_SUCCESS when posted, OMPI_ERR_RESOURCE_BUSY when a WQE or
 *         credit is unavailable, OMPI_ERROR when post_send() fails.
 */
int mca_btl_wv_endpoint_post_send(mca_btl_wv_endpoint_t *endpoint,
        mca_btl_wv_send_frag_t *frag)
{
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    mca_btl_wv_header_t *hdr = frag->hdr;
    bool do_rdma = false;
    size_t rdma_size_limit;
    int32_t cm_pending;
    int post_rc;
    int qp;

    /* an unordered descriptor is pinned to the fragment's own qp */
    if (OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER)) {
        des->order = frag->qp_idx;
    }
    qp = des->order;

    if (OMPI_SUCCESS != acruire_wqe(endpoint, frag)) {
        return OMPI_ERR_RESOURCE_BUSY;
    }

    rdma_size_limit = mca_btl_wv_component.eager_limit +
        sizeof(mca_btl_wv_header_coalesced_t) +
        sizeof(mca_btl_wv_control_header_t);

    /* High priority frag that fits. Try to send over eager RDMA */
    do_rdma = (des->des_src->seg_len + frag->coalesced_length <= rdma_size_limit) &&
              (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY) &&
              (OMPI_SUCCESS == acquire_eager_rdma_send_credit(endpoint));

    if (!do_rdma) {
        if (OMPI_SUCCESS != acquire_send_credit(endpoint, frag)) {
            qp_put_wqe(endpoint, qp);
            return OMPI_ERR_RESOURCE_BUSY;
        }
    }

    /* piggy-back eager RDMA credits, tagging them with the RDMA flag */
    BTL_WV_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if (hdr->credits) {
        hdr->credits |= BTL_WV_RDMA_CREDITS_FLAG;
    }

    if (do_rdma) {
        hdr->credits |= (qp << 11);
    } else if (BTL_WV_QP_TYPE_PP(qp) && 0 == hdr->credits) {
        /* nothing piggy-backed yet: carry per-peer receive credits */
        BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
    }

    BTL_WV_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_pending);
    /* cm_seen is capped at 255 here, while cm_return is a 32-bit counter;
     * whatever does not fit is put back on the counter */
    if (cm_pending <= 255) {
        hdr->cm_seen = cm_pending;
    } else {
        hdr->cm_seen = 255;
        cm_pending -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_pending);
    }

    post_rc = post_send(endpoint, frag, do_rdma);
    if (0 == post_rc) {
        return OMPI_SUCCESS;
    }

    /* posting failed: convert the header back and return what we took */
    if (endpoint->nbo) {
        BTL_WV_HEADER_NTOH(*hdr);
    }

    if (BTL_WV_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                          BTL_WV_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if (do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else if (BTL_WV_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits, hdr->credits);
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
    } else if (BTL_WV_QP_TYPE_SRQ(qp)) {
        mca_btl_wv_module_t *module = endpoint->endpoint_btl;
        OPAL_THREAD_ADD32(&module->qps[qp].u.srq_qp.sd_credits, 1);
    }

    BTL_ERROR(("error posting send request error %d: %s\n",
               post_rc, strerror(post_rc)));
    return OMPI_ERROR;
}