/*
 * Create the local side of all the qp's. The remote sides will be
 * connected later.
 */
static int qp_create_all(mca_btl_base_endpoint_t* endpoint)
{
    int qp, rc, pp_qp_num = 0;
    int32_t rd_rsv_total = 0;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp)
        if (BTL_OPENIB_QP_TYPE_PP(qp)) {
            rd_rsv_total +=
                mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv;
            pp_qp_num++;
        }

    /* If there are no PP QPs we still need a reserved WQE for eager RDMA
     * flow control */
    if (0 == pp_qp_num && true == endpoint->use_eager_rdma)
        pp_qp_num = 1;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        struct ibv_srq *srq = NULL;
        uint32_t max_recv_wr, max_send_wr;
        int32_t rd_rsv, rd_num_credits;

        /* The QP used for SW flow control needs some additional resources */
        if (qp == mca_btl_openib_component.credits_qp) {
            rd_rsv = rd_rsv_total;
            rd_num_credits = pp_qp_num;
        } else {
            rd_rsv = rd_num_credits = 0;
        }

        if (BTL_OPENIB_QP_TYPE_PP(qp)) {
            max_recv_wr = mca_btl_openib_component.qp_infos[qp].rd_num + rd_rsv;
            max_send_wr = mca_btl_openib_component.qp_infos[qp].rd_num +
                rd_num_credits;
        } else {
            srq = endpoint->endpoint_btl->qps[qp].u.srq_qp.srq;
            /* no receives are posted to SRQ qp */
            max_recv_wr = 0;
            max_send_wr = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max
                + rd_num_credits;
        }

        rc = qp_create_one(endpoint, qp, srq, max_recv_wr, max_send_wr);
        if (OMPI_SUCCESS != rc) {
            return rc;
        }
    }

    /* Now that all the qp's are created locally, post some receive
       buffers, setup credits, etc. */
    return mca_btl_openib_endpoint_post_recvs(endpoint);
}
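/* Illustrative sketch, not part of the original source: a worked sizing
 * example for the credits QP above, with hypothetical numbers. Assume two
 * PP QPs, each with u.pp_qp.rd_rsv = 4, and rd_num = 8 on the credits QP;
 * then rd_rsv_total = 8 and pp_qp_num = 2, so: */
static void qp_sizing_example_sketch(void)
{
    int rd_num = 8;
    int rd_rsv_total = 4 + 4;   /* sum of rd_rsv over the two PP QPs */
    int pp_qp_num = 2;

    unsigned max_recv_wr = rd_num + rd_rsv_total; /* 16: room for incoming
                                                     credit messages */
    unsigned max_send_wr = rd_num + pp_qp_num;    /* 10: one extra send WQE
                                                     per PP QP */
    (void)max_recv_wr;
    (void)max_send_wr;
}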
static int acquire_send_credit(mca_btl_openib_endpoint_t *endpoint,
        mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    int qp = to_base_frag(frag)->base.order;
    int prio = !(to_base_frag(frag)->base.des_flags & MCA_BTL_DES_FLAGS_PRIORITY);

    if (BTL_OPENIB_QP_TYPE_PP(qp)) {
        if (OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
            opal_list_append(&endpoint->qps[qp].no_credits_pending_frags[prio],
                    (opal_list_item_t *)frag);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    } else {
        if (OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, -1) < 0) {
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
            OPAL_THREAD_LOCK(&openib_btl->ib_lock);
            opal_list_append(&openib_btl->qps[qp].u.srq_qp.pending_frags[prio],
                    (opal_list_item_t *)frag);
            OPAL_THREAD_UNLOCK(&openib_btl->ib_lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    return OMPI_SUCCESS;
}
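/* Illustrative counterpart, a sketch that is not in the original file:
 * acquire_send_credit() decrements the credit counter optimistically and
 * rolls back if it goes negative; the completion path is expected to
 * return the credit and re-dispatch anything queued. The helper name
 * below is hypothetical, but the fields it touches all appear above. */
static void return_send_credit_sketch(mca_btl_openib_endpoint_t *endpoint, int qp)
{
    if (BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
    } else {
        OPAL_THREAD_ADD32(&endpoint->endpoint_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }
    /* ...then drain no_credits_pending_frags[pri] / pending_frags[pri]
     * and retry the queued sends. */
}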
/* The main idea of the SRQ resizing algorithm: we create an SRQ with
 * size = rd_num, but for efficient resource usage we only post
 * rd_curr_num < rd_num WQEs. This value is increased on demand in the
 * IBV_EVENT_SRQ_LIMIT_REACHED event handler (i.e. in this function);
 * the device raises the event when the number of WQEs in the SRQ drops
 * below srq_limit. */
static int btl_openib_async_srq_limit_event(struct ibv_srq* srq)
{
    int qp, rc = OPAL_SUCCESS;
    mca_btl_openib_module_t *openib_btl = NULL;

    opal_mutex_t *lock = &mca_btl_openib_component.srq_manager.lock;
    opal_hash_table_t *srq_addr_table =
        &mca_btl_openib_component.srq_manager.srq_addr_table;

    opal_mutex_lock(lock);

    if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(srq_addr_table,
                &srq, sizeof(struct ibv_srq*), (void*) &openib_btl)) {
        /* If there is no element with this key in the table, we assume
           the SRQ was destroyed and don't serve the event */
        goto srq_limit_event_exit;
    }

    for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        if (!BTL_OPENIB_QP_TYPE_PP(qp)) {
            if (openib_btl->qps[qp].u.srq_qp.srq == srq) {
                break;
            }
        }
    }

    if (qp >= mca_btl_openib_component.num_qps) {
        BTL_ERROR(("Open MPI tried to access a shared receive queue (SRQ) "
                   "on the device %s that was not found. This should not "
                   "happen, and is a fatal error. Your MPI job will now abort.\n",
                   ibv_get_device_name(openib_btl->device->ib_dev)));
        rc = OPAL_ERROR;
        goto srq_limit_event_exit;
    }

    /* dynamically re-size the SRQ to be larger */
    openib_btl->qps[qp].u.srq_qp.rd_curr_num <<= 1;
    if (openib_btl->qps[qp].u.srq_qp.rd_curr_num >=
            mca_btl_openib_component.qp_infos[qp].rd_num) {
        openib_btl->qps[qp].u.srq_qp.rd_curr_num =
            mca_btl_openib_component.qp_infos[qp].rd_num;
        openib_btl->qps[qp].u.srq_qp.rd_low_local =
            mca_btl_openib_component.qp_infos[qp].rd_low;
        openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = false;
        goto srq_limit_event_exit;
    }

    openib_btl->qps[qp].u.srq_qp.rd_low_local <<= 1;
    openib_btl->qps[qp].u.srq_qp.srq_limit_event_flag = true;

srq_limit_event_exit:
    opal_mutex_unlock(lock);
    return rc;
}
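/* Sketch, not from the original source, of how this handler is typically
 * reached and how the limit is re-armed. ibv_get_async_event(),
 * ibv_ack_async_event() and ibv_modify_srq() with IBV_SRQ_LIMIT are
 * standard libibverbs calls; "ctx", "new_limit" and the function name are
 * placeholders, and the real code re-posts WQEs elsewhere. */
static void srq_event_loop_sketch(struct ibv_context *ctx, uint32_t new_limit)
{
    struct ibv_async_event event;

    if (0 == ibv_get_async_event(ctx, &event)) {
        if (IBV_EVENT_SRQ_LIMIT_REACHED == event.event_type) {
            struct ibv_srq *srq = event.element.srq;
            struct ibv_srq_attr attr;

            btl_openib_async_srq_limit_event(srq);

            /* re-arm: fire the event again when the number of posted
             * WQEs drops below new_limit */
            memset(&attr, 0, sizeof(attr));
            attr.srq_limit = new_limit;
            (void) ibv_modify_srq(srq, &attr, IBV_SRQ_LIMIT);
        }
        ibv_ack_async_event(&event);
    }
}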
/*
 * Called when the connect module has created all the qp's on an
 * endpoint and needs to have some receive buffers posted.
 */
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t *endpoint)
{
    int qp;

    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        if (BTL_OPENIB_QP_TYPE_PP(qp)) {
            mca_btl_openib_endpoint_post_rr_nolock(endpoint, qp);
        } else {
            mca_btl_openib_post_srr(endpoint->endpoint_btl, qp);
        }
    }

    return OPAL_SUCCESS;
}
/*
 * Find all the CPCs that are eligible for a single local port (i.e.,
 * openib module).
 */
int ompi_btl_openib_connect_base_select_for_local_port(mca_btl_openib_module_t *btl)
{
    char *msg = NULL;
    int i, rc, cpc_index, len;
    ompi_btl_openib_connect_base_module_t **cpcs;

    cpcs = calloc(num_available,
                  sizeof(ompi_btl_openib_connect_base_module_t *));
    if (NULL == cpcs) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* Go through all available CPCs and query them to see if they want
       to run on this module. If they do, save them to a running array. */
    for (len = 1, i = 0; NULL != available[i]; ++i) {
        len += strlen(available[i]->cbc_name) + 2;
    }
    msg = malloc(len);
    if (NULL == msg) {
        free(cpcs);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    msg[0] = '\0';

    for (cpc_index = i = 0; NULL != available[i]; ++i) {
        if (i > 0) {
            strcat(msg, ", ");
        }
        strcat(msg, available[i]->cbc_name);

        rc = available[i]->cbc_query(btl, &cpcs[cpc_index]);
        if (OMPI_ERR_NOT_SUPPORTED == rc || OMPI_ERR_UNREACH == rc) {
            continue;
        } else if (OMPI_SUCCESS != rc) {
            free(cpcs);
            free(msg);
            return rc;
        }
        opal_output(-1, "match cpc for local port: %s",
                    available[i]->cbc_name);

        /* If the CPC wants to use the CTS protocol, check to ensure
           that QP 0 is PP; if it's not, we can't use this CPC (or the
           CTS protocol) */
        if (cpcs[cpc_index]->cbm_uses_cts &&
            !BTL_OPENIB_QP_TYPE_PP(0)) {
            BTL_VERBOSE(("this CPC only supports when the first "
                         "btl_openib_receive_queues QP is a PP QP"));
            continue;
        }

        /* This CPC has indicated that it wants to run on this openib
           BTL module. Woo hoo! */
        ++cpc_index;
    }

    /* If we got an empty array, then no CPCs were eligible. Doh! */
    if (0 == cpc_index) {
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "no cpcs for port", true,
                       orte_process_info.nodename,
                       ibv_get_device_name(btl->device->ib_dev),
                       btl->port_num, msg);
        free(cpcs);
        free(msg);
        return OMPI_ERR_NOT_SUPPORTED;
    }
    free(msg);

    /* We got at least one eligible CPC; save the array into the
       module's port_info */
    btl->cpcs = cpcs;
    btl->num_cpcs = cpc_index;

    return OMPI_SUCCESS;
}
/**
 * This function is called when we get an error on the completion
 * event of a fragment. We check to see what type of fragment it is
 * and act accordingly. In most cases, we first call up into the PML
 * and have it map out this connection for any future communication.
 * In addition, this function will possibly send some control messages
 * over the other openib BTL. The first control message will tell the
 * remote side to also map out this connection. The second control
 * message makes sure the eager RDMA connection remains in a sane
 * state. See that function for more details.
 *
 * @param openib_btl Pointer to BTL that had the error
 * @param des Pointer to descriptor that had the error
 * @param qp Queue pair that had the error
 * @param remote_proc Pointer to process that had the error
 * @param endpoint Pointer to endpoint that had the error
 */
void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl,
                                          mca_btl_base_descriptor_t *des,
                                          int qp,
                                          opal_proc_t* remote_proc,
                                          mca_btl_openib_endpoint_t* endpoint)
{
    char *btlname = NULL;
    int btl_ownership;

    /* Since this BTL supports failover, it will call the PML error handler
     * function with the NONFATAL flag. If the PML is running with failover
     * support, then it will map out the endpoint for further communication
     * and return control here. If the PML does not have failover support,
     * it will abort the job and control will not return here. */

    /* Note: At this point, what needs to be done is based on the type
     * of openib fragment that got the error. Also note that in the wc
     * struct, when wc->status != IBV_WC_SUCCESS, these are the only
     * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num.
     * This means that one cannot key off of the wc->opcode to see what
     * operation was done. The important information needs to be read
     * from the fragment. */

    /* We cannot issue a callback for SRQ errors because the receive
     * queue is shared and is not specific to a connection. There is no
     * way to figure out what type of message created the error because
     * we would need the information in the wc->imm_data field, which
     * does not exist when we have an error. So, nothing to do here but
     * return. */
    if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "SRQ RECV type=%d", openib_frag_type(des));
        /* Need to think about returning any shared resources of the
         * SRQ. For now, we do nothing as we rarely see an error on
         * the SRQ. */
        return;
    }
    assert(NULL != remote_proc);

    /* Create a nice string to help with debug */
    if (NULL != openib_btl) {
        asprintf(&btlname, "lid=%d:name=%s",
                 openib_btl->lid, openib_btl->device->ib_dev->name);
    }

    /* The next set of errors are associated with an endpoint, but not
     * with a PML descriptor. They are not associated with a PML
     * descriptor because:
     *   A. It was a receive
     *   B. It was some type of openib specific control message.
     * Therefore, just drop the fragments and call up into the PML to
     * disable this endpoint for future communication. */
    if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) &&
         (BTL_OPENIB_QP_TYPE_PP(qp))) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) ||
        (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) {
        openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                             remote_proc, btlname);
        /* Now that this connection has been mapped out at the PML layer,
         * we change the state in the BTL layer. The change in the PML
         * layer should prevent us from ever trying to send on this BTL
         * again; if we do, then that is an error case. */
        if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
            endpoint->endpoint_state = MCA_BTL_IB_FAILED;
            mca_btl_openib_endpoint_notify(endpoint,
                                           MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
            error_out_all_pending_frags(endpoint, &openib_btl->super, true);
        }

        opal_output_verbose(60, mca_btl_openib_component.verbose_failover,
                            "MCA_BTL_OPENIB_FRAG=%d, "
                            "dropping since connection is broken (des=%lx)",
                            openib_frag_type(des), (long unsigned int) des);
        if (NULL != btlname) free(btlname);
        return;
    }

    /* These are RDMA read type fragments. Just continue with processing */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) {
        OPAL_THREAD_ADD32(&endpoint->get_tokens, 1);
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "OPENIB_FRAG_RECV_USER fragment, "
                            "btl=%lx, continue with callbacks",
                            (long unsigned int) &openib_btl->super);
    }

    /* If we are at this point, we have completed a send, RDMA read or
     * RDMA write. Call the PML callback function to map out this
     * btl for further sending. We just call this every time we get an
     * error even though it is not necessary. Subsequent calls with
     * the same remote_proc argument will not actually map anything out. */
    openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL,
                         remote_proc, btlname);
    if (NULL != btlname) free(btlname);

    /* Since we believe we have done a send, read or write, the
     * des_local fields should have valid data. */
    assert(des->des_local != NULL);

    /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then
     * change the status. Since this connection was mapped out in the
     * PML layer, no more attempts should be made to send on it. In
     * addition, send a message to the other end of the connection
     * letting it know that this side is now broken. This is needed in
     * the case of a spurious error which may not cause the remote side
     * to detect the error. */
    if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) {
        endpoint->endpoint_state = MCA_BTL_IB_FAILED;
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0);
    }

    /* Now, call the callback function associated with the fragment.
     * In case the fragments were coalesced we need to pull them apart
     * and call the callback function for each one. */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
        opal_list_item_t *i;
        while ((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags &
                             MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if (btl_ownership) {
                mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base);
            }
        }
    }
    /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER
     * or MCA_BTL_OPENIB_FRAG_RECV_USER. */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR);
    if (btl_ownership) {
        mca_btl_openib_free(&openib_btl->super, des);
    }

    /* Here we send another control message to notify the remote side
     * we had an error on an eager fragment. A non-zero value for the
     * ftr variable indicates that this was an eager RDMA fragment.
     * We need to do this in case the eager RDMA fragment after this
     * one actually made it successfully. */
    if (0 != to_send_frag(des)->ftr) {
        mca_btl_openib_endpoint_notify(endpoint,
                                       MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR,
                                       (long)to_send_frag(des)->ftr - 1);
    }

    /* We know we have completed a send so return some resources even
     * though the connection is broken. With SRQ, the resources are
     * shared, so if we do not return the credits we may not be allowed
     * to send any more. */
    qp_put_wqe(endpoint, qp);
    if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) &&
        !BTL_OPENIB_QP_TYPE_PP(qp)) {
        OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
    }

    /* There are several queues associated with an endpoint that may
     * have some unsent fragments sitting in them. Remove them and
     * call the callback functions with an error so the PML can send
     * them down a different path. This really only needs to be called
     * once on an endpoint, but for now, just call it a bunch of times.
     * The first time through will remove the unsent fragments, so
     * subsequent calls are no-ops. */
    if (endpoint) {
        error_out_all_pending_frags(endpoint, &openib_btl->super, true);
    }
}
/**
 * This function will find all the pending fragments on an endpoint
 * and call the callback function with OPAL_ERROR. It walks through
 * each qp with each priority and looks for both no_credits_pending_frags
 * and no_wqe_pending_frags. It then looks for any pending_lazy_frags,
 * pending_put_frags, and pending_get_frags. This function is only
 * called when running with failover support enabled. Note that
 * the errout parameter allows the function to also be used as a
 * debugging tool to see if there are any fragments on any of the
 * queues.
 *
 * @param ep Pointer to endpoint that had error
 * @param module Pointer to module that had error
 * @param errout Boolean which says whether to error them out or not
 */
static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep,
                                        struct mca_btl_base_module_t* module,
                                        bool errout)
{
    int qp, pri, len, total, btl_ownership;
    opal_list_item_t *item;
    mca_btl_openib_com_frag_t* frag;
    mca_btl_base_descriptor_t *des;
    int verbose = 10;  /* Verbosity level unless debugging */

    /* If debugging, drop the verbosity level so we can see the output
     * regardless of the level the program was run with. */
    if (false == errout) {
        verbose = 0;
    }

    total = 0;
    /* Traverse all QPs and all priorities and move to other endpoint */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
        for (pri = 0; pri < 2; ++pri) {
            /* All types of qp's have a no_wqe_pending_frags list */
            len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]);
            if (len > 0) {
                total += len;
                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                    "IB: Checking for no_wqe_pending_frags qp=%d, "
                                    "pri=%d, list size=%d",
                                    qp, pri, len);
                if (true == errout) {
                    while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                  no_wqe_pending_frags[pri]))) {
                        frag = (mca_btl_openib_com_frag_t *) item;
                        des = (mca_btl_base_descriptor_t *)frag;
                        /* Error out any coalesced frags if they exist */
                        if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                            opal_list_item_t *i;
                            while ((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                    "IB: Found coalesced frag in no_wqe_pending_frags");
                                btl_ownership = (to_base_frag(i)->base.des_flags &
                                                 MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                 &to_base_frag(i)->base, OPAL_ERROR);
                                if (btl_ownership) {
                                    mca_btl_openib_free(module, &to_base_frag(i)->base);
                                }
                            }
                        }
                        btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                        des->des_cbfunc(module, ep, des, OPAL_ERROR);
                        if (btl_ownership) {
                            mca_btl_openib_free(module, des);
                        }
                    }
                }
            }
            if (BTL_OPENIB_QP_TYPE_PP(qp)) {
                len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for no_credits_pending_frags qp=%d, "
                                        "pri=%d, list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->qps[qp].
                                                                      no_credits_pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;
                            /* Error out any coalesced frags if they exist */
                            if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while ((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in "
                                                        "no_credits_pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if (btl_ownership) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if (btl_ownership) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
                len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]);
                if (len > 0) {
                    total += len;
                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                        "IB: Checking for srq pending_frags qp=%d, pri=%d, "
                                        "list size=%d",
                                        qp, pri, len);
                    if (true == errout) {
                        while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp].
                                                                      u.srq_qp.pending_frags[pri]))) {
                            frag = (mca_btl_openib_com_frag_t *) item;
                            des = (mca_btl_base_descriptor_t *)frag;
                            /* Error out any coalesced frags if they exist */
                            if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
                                opal_list_item_t *i;
                                while ((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
                                    opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                                                        "IB: Found coalesced frag in SRQ pending_frags");
                                    btl_ownership = (to_base_frag(i)->base.des_flags &
                                                     MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                                    to_base_frag(i)->base.des_cbfunc(module, ep,
                                                                     &to_base_frag(i)->base, OPAL_ERROR);
                                    if (btl_ownership) {
                                        mca_btl_openib_free(module, &to_base_frag(i)->base);
                                    }
                                }
                            }
                            btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
                            des->des_cbfunc(module, ep, des, OPAL_ERROR);
                            if (btl_ownership) {
                                mca_btl_openib_free(module, des);
                            }
                        }
                    }
                }
            }
        }
    }

    /* Check for any frags from a connection that was never made. Not sure
     * if this can actually happen. */
    len = opal_list_get_size(&ep->pending_lazy_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_lazy_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_put_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_put_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    len = opal_list_get_size(&ep->pending_get_frags);
    if (len > 0) {
        total += len;
        opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover,
                            "IB: Checking for pending_get_frags, list size=%d", len);
        if (true == errout) {
            while (NULL != (item = opal_list_remove_first(&(ep->pending_get_frags)))) {
                frag = (mca_btl_openib_com_frag_t *) item;
                des = (mca_btl_base_descriptor_t *)frag;
                des->des_cbfunc(module, ep, des, OPAL_ERROR);
            }
        }
    }

    opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover,
                        "IB: Finished checking for pending_frags, total moved=%d",
                        total);
}
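/* The three queue-drain loops above repeat one pattern: pop a fragment,
 * error out any coalesced children, then invoke the callback and free the
 * descriptor if it is BTL-owned. A hypothetical helper capturing that
 * pattern, a sketch rather than part of the original file, might look
 * like this: */
static void errout_frag_sketch(struct mca_btl_base_module_t *module,
                               mca_btl_base_endpoint_t *ep,
                               mca_btl_base_descriptor_t *des)
{
    int btl_ownership;

    /* Error out any coalesced frags hanging off a send fragment */
    if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) {
        opal_list_item_t *i;
        while ((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) {
            btl_ownership = (to_base_frag(i)->base.des_flags &
                             MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
            to_base_frag(i)->base.des_cbfunc(module, ep,
                                             &to_base_frag(i)->base, OPAL_ERROR);
            if (btl_ownership) {
                mca_btl_openib_free(module, &to_base_frag(i)->base);
            }
        }
    }
    /* Then the fragment itself */
    btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
    des->des_cbfunc(module, ep, des, OPAL_ERROR);
    if (btl_ownership) {
        mca_btl_openib_free(module, des);
    }
}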
/* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
                                      mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    size_t eager_limit;

    if (OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if (acquire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_RESOURCE_BUSY;

    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);

    if (des->des_src->seg_len + frag->coalesced_length <= eager_limit &&
        (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if (acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    if (!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if (hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    if (!do_rdma) {
        if (BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits,
                                   hdr->credits);
        }
    } else {
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bits, but cm_return is 32 bits */
    if (cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if (!ib_rc)
        return OMPI_SUCCESS;

    /* post_send failed; undo the resource accounting done above */
    if (endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    if (BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                          BTL_OPENIB_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if (do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if (BTL_OPENIB_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                              hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}
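/* Illustrative sketch, not from the original source: when sending over
 * eager RDMA, the sender above packs the QP index into the upper bits of
 * the header's credits field (hdr->credits |= qp << 11). A receiver would
 * recover it roughly as below; the 5-bit mask is an assumption about the
 * field layout, not something this file defines. */
static void credits_packing_sketch(void)
{
    unsigned short credits_field = 0;
    int qp_idx = 2;                              /* example QP index */

    credits_field |= (unsigned short)(qp_idx << 11);   /* sender side */

    int recovered = (credits_field >> 11) & 0x1f;      /* receiver side: 2 */
    (void)recovered;
}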
/*
 * Create the local side of one qp. The remote side will be connected
 * later.
 */
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
                         struct ibv_srq *srq, uint32_t max_recv_wr,
                         uint32_t max_send_wr)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    struct ibv_qp *my_qp;
    struct ibv_qp_init_attr init_attr;
    struct ibv_qp_attr attr;
    size_t req_inline;

    memset(&init_attr, 0, sizeof(init_attr));
    memset(&attr, 0, sizeof(attr));

    init_attr.qp_type = IBV_QPT_RC;
    init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
    init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
    init_attr.srq = srq;
    init_attr.cap.max_inline_data = req_inline =
        max_inline_size(qp, openib_btl->device);
    init_attr.cap.max_send_sge = 1;
    init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    if (BTL_OPENIB_QP_TYPE_PP(qp)) {
        init_attr.cap.max_recv_wr = max_recv_wr;
    } else {
        init_attr.cap.max_recv_wr = 0;
    }
    init_attr.cap.max_send_wr = max_send_wr;

    my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
    if (NULL == my_qp) {
        BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }
    endpoint->qps[qp].qp->lcl_qp = my_qp;

    if (init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[qp].ib_inline_max = init_attr.cap.max_inline_data;
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true,
                       orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num,
                       req_inline, init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[qp].ib_inline_max = req_inline;
    }

    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = openib_btl->pkey_index;
    attr.port_num = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;

    if (ibv_modify_qp(endpoint->qps[qp].qp->lcl_qp,
                      &attr,
                      IBV_QP_STATE |
                      IBV_QP_PKEY_INDEX |
                      IBV_QP_PORT |
                      IBV_QP_ACCESS_FLAGS)) {
        BTL_ERROR(("error modifying qp to INIT errno says %s",
                   strerror(errno)));
        return OMPI_ERROR;
    }

    /* Setup meta data on the endpoint (IB PSNs are 24 bits wide, hence
     * the mask) */
    endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
    endpoint->qps[qp].credit_frag = NULL;

    return OMPI_SUCCESS;
}
/*
 * Connect the local ends of all qp's to the remote side
 */
static int qp_connect_all(mca_btl_openib_endpoint_t *endpoint)
{
    int i;
    mca_btl_openib_module_t* openib_btl =
        (mca_btl_openib_module_t*)endpoint->endpoint_btl;

    for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
        struct ibv_qp_attr attr;
        struct ibv_qp* qp = endpoint->qps[i].qp->lcl_qp;
        enum ibv_mtu mtu =
            (openib_btl->device->mtu < endpoint->rem_info.rem_mtu) ?
            openib_btl->device->mtu : endpoint->rem_info.rem_mtu;

        memset(&attr, 0, sizeof(attr));
        attr.qp_state           = IBV_QPS_RTR;
        attr.path_mtu           = mtu;
        attr.dest_qp_num        = endpoint->rem_info.rem_qps[i].rem_qp_num;
        attr.rq_psn             = endpoint->rem_info.rem_qps[i].rem_psn;
        attr.max_dest_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
        attr.min_rnr_timer      = mca_btl_openib_component.ib_min_rnr_timer;
        attr.ah_attr.is_global     = 0;
        attr.ah_attr.dlid          = endpoint->rem_info.rem_lid;
        attr.ah_attr.sl            = mca_btl_openib_component.ib_service_level;
        attr.ah_attr.src_path_bits = openib_btl->src_path_bits;
        attr.ah_attr.port_num      = openib_btl->port_num;
        /* JMS to be filled in later dynamically */
        attr.ah_attr.static_rate   = 0;

        if (mca_btl_openib_component.verbose) {
            BTL_OUTPUT(("Set MTU to IBV value %d (%s bytes)", mtu,
                        (mtu == IBV_MTU_256) ? "256" :
                        (mtu == IBV_MTU_512) ? "512" :
                        (mtu == IBV_MTU_1024) ? "1024" :
                        (mtu == IBV_MTU_2048) ? "2048" :
                        (mtu == IBV_MTU_4096) ? "4096" :
                        "unknown (!)"));
        }

        if (ibv_modify_qp(qp, &attr,
                          IBV_QP_STATE |
                          IBV_QP_AV |
                          IBV_QP_PATH_MTU |
                          IBV_QP_DEST_QPN |
                          IBV_QP_RQ_PSN |
                          IBV_QP_MAX_DEST_RD_ATOMIC |
                          IBV_QP_MIN_RNR_TIMER)) {
            BTL_ERROR(("error modifying QP to RTR errno says %s",
                       strerror(errno)));
            return OMPI_ERROR;
        }

        attr.qp_state  = IBV_QPS_RTS;
        attr.timeout   = mca_btl_openib_component.ib_timeout;
        attr.retry_cnt = mca_btl_openib_component.ib_retry_count;
        /* On PP QPs we have SW flow control, so there is no need for RNR
         * retries. Setting it to zero helps to catch bugs. */
        attr.rnr_retry = BTL_OPENIB_QP_TYPE_PP(i) ? 0 :
            mca_btl_openib_component.ib_rnr_retry;
        attr.sq_psn    = endpoint->qps[i].qp->lcl_psn;
        attr.max_rd_atomic = mca_btl_openib_component.ib_max_rdma_dst_ops;
        if (ibv_modify_qp(qp, &attr,
                          IBV_QP_STATE |
                          IBV_QP_TIMEOUT |
                          IBV_QP_RETRY_CNT |
                          IBV_QP_RNR_RETRY |
                          IBV_QP_SQ_PSN |
                          IBV_QP_MAX_QP_RD_ATOMIC)) {
            BTL_ERROR(("error modifying QP to RTS errno says %s",
                       strerror(errno)));
            return OMPI_ERROR;
        }
    }

    return OMPI_SUCCESS;
}