/*
 * Called when the connect module has completed setup of an endpoint.
 *
 * Marks the endpoint CONNECTED, drains any sends that were queued while
 * the connection was being established, and restarts pending put/get
 * operations.  NOTE(review): assumes the caller holds
 * endpoint->endpoint_lock — this function releases it.
 */
void mca_btl_wv_endpoint_connected(mca_btl_wv_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item;
    mca_btl_wv_send_frag_t *frag;

    opal_output(-1, "Now we are CONNECTED");

    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    /* The connection is correctly setup. Now we can decrease the
     * event trigger. */
    opal_progress_event_users_decrement();

    /* Process pending packets on the endpoint: while there are frags
     * queued waiting for the connection, post them now. */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */
        if (OMPI_SUCCESS != mca_btl_wv_endpoint_post_send(endpoint, frag)) {
            BTL_ERROR(("Error posting send"));
        }
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* If the upper layer called put or get before the connection moved to
     * the connected state, then we restart them here. */
    mca_btl_wv_frag_progress_pending_put_get(endpoint,
                                             mca_btl_wv_component.rdma_qp);
}
/*
 * Free-list constructor for wv fragments: stamp the qp index (and, for
 * receive fragments, the maximum scatter length) onto a freshly
 * allocated fragment and remember which free list owns it.
 */
void mca_btl_wv_frag_init(ompi_free_list_item_t* item, void* ctx)
{
    mca_btl_wv_frag_init_data_t *data = (mca_btl_wv_frag_init_data_t *) ctx;
    mca_btl_wv_frag_t *base = to_base_frag(item);

    if (MCA_BTL_WV_FRAG_RECV == base->type) {
        /* Receive fragments must be able to hold a full payload for this
         * qp plus all of the btl-level headers. */
        to_recv_frag(base)->qp_idx = data->order;
        to_com_frag(base)->sg_entry.Length =
            mca_btl_wv_component.qp_infos[data->order].size
            + sizeof(mca_btl_wv_header_t)
            + sizeof(mca_btl_wv_header_coalesced_t)
            + sizeof(mca_btl_wv_control_header_t);
    } else if (MCA_BTL_WV_FRAG_SEND == base->type) {
        to_send_frag(base)->qp_idx = data->order;
    }

    base->list = data->list;
}
/** * This function is called when we get an error on the completion * event of a fragment. We check to see what type of fragment it is * and act accordingly. In most cases, we first call up into the PML * and have it map out this connection for any future communication. * In addition, this function will possibly send some control messages * over the other openib BTL. The first control message will tell the * remote side to also map out this connection. The second control * message makes sure the eager RDMA connection remains in a sane * state. See that function for more details. * @param openib_btl Pointer to BTL that had the error * @param des Pointer to descriptor that had the error * @param qp Queue pair that had the error * @param remote_proc Pointer to process that had the error * @param endpoint Pointer to endpoint that had the error */ void mca_btl_openib_handle_endpoint_error(mca_btl_openib_module_t *openib_btl, mca_btl_base_descriptor_t *des, int qp, opal_proc_t* remote_proc, mca_btl_openib_endpoint_t* endpoint) { char *btlname = NULL; int btl_ownership; /* Since this BTL supports failover, it will call the PML error handler * function with the NONFATAL flag. If the PML is running with failover * support, then it will map out the endpoint for further communication * and return control here. If the PML does not have failover support, * it will abort the job and control will not return here. */ /* Note: At this point, what needs to be done is based on the type * of openib fragment that got the error. Also note that in the wc * struct, when wc->status != IBV_WC_SUCCESS, these are the only * valid fields: wc->wr_id, wc->status, wc->vendor_err, wc->qp_num. * This means that one cannot key off of the wc->opcode to see what * operation was done. The important information needs to be read * from the fragment. */ /* Cannot issue callback to SRQ errors because the shared receive * queue is shared and is not specific to a connection. 
There is no * way to figure out what type of message created the error because * we need the information in the wc->imm_data field which does not * exist when we have an error. So, nothing to do here but return. */ if ((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && !BTL_OPENIB_QP_TYPE_PP(qp)) { opal_output_verbose(20, mca_btl_openib_component.verbose_failover, "SRQ RECV type=%d", openib_frag_type(des)); /* Need to think about returning any shared resources of the * SRQ. For now, we do nothing as we rarely see an error on * the SRQ. */ return; } assert(NULL != remote_proc); /* Create a nice string to help with debug */ if (NULL != openib_btl) { asprintf(&btlname, "lid=%d:name=%s", openib_btl->lid, openib_btl->device->ib_dev->name); } /* The next set of errors are associated with an endpoint, but not * with a PML descriptor. They are not associated with a PML * descriptor because: * A. It was a receive * B. It was some type of openib specific control message. * Therefore, just drop the fragments and call up into the PML to * disable this endpoint for future communication. */ if (((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV) && (BTL_OPENIB_QP_TYPE_PP(qp))) || (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_CONTROL) || (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_EAGER_RDMA)) { openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); /* Now that this connection has been mapped out at the PML layer, * we change the state in the BTL layer. The change in the PML * layer should prevent that we ever try to send on this BTL * again. If we do, then this is an error case. 
*/ if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { endpoint->endpoint_state = MCA_BTL_IB_FAILED; mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); error_out_all_pending_frags(endpoint, &openib_btl->super, true); } opal_output_verbose(60, mca_btl_openib_component.verbose_failover, "MCA_BTL_OPENIG_FRAG=%d, " "dropping since connection is broken (des=%lx)", openib_frag_type(des), (long unsigned int) des); if (NULL != btlname) free(btlname); return; } /* These are RDMA read type fragments. Just continue with processing */ if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_RECV_USER) { OPAL_THREAD_ADD32(&endpoint->get_tokens, 1); opal_output_verbose(20, mca_btl_openib_component.verbose_failover, "OPENIB_FRAG_RECV_USER fragment, " "btl=%lx, continue with callbacks", (long unsigned int) &openib_btl->super); } /* If we are at this point, we have completed a send, RDMA read or * RDMA write. Call the PML callback function to map out this * btl for further sending. We just call this every time we get an * error even though it is not necessary. Subsequent calls with * the same remote_proc argument will not actually map anything out. */ openib_btl->error_cb(&openib_btl->super, MCA_BTL_ERROR_FLAGS_NONFATAL, remote_proc, btlname); if (NULL != btlname) free(btlname); /* Since we believe we have done a send, read or write, then the * des_local fields should have valid data. */ assert(des->des_local != NULL); /* If the endpoint is not yet in the MCA_BTL_IB_CLOSED state, then * change the status. Since this connection was mapped out in the * PML layer, no more attempts should be made to send on it. In * addition, send a message to other end of the connection letting * it know that this side is now broken. This is needed in the case * of a spurious error which may not cause the remote side to detect * the error. 
*/ if (MCA_BTL_IB_FAILED != endpoint->endpoint_state) { endpoint->endpoint_state = MCA_BTL_IB_FAILED; mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_BROKEN, 0); } /* Now, call the callback function associated with the fragment. * In case the fragments were coalesced we need to pull them apart * and call the callback function for each one. */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(&openib_btl->super, endpoint, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, &to_base_frag(i)->base); } } } /* This must be a MCA_BTL_OPENIB_FRAG_SEND, MCA_BTL_OPENIB_FRAG_SEND_USER * or MCA_BTL_OPENIB_FRAG_RECV_USER. */ btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(&openib_btl->super, endpoint, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(&openib_btl->super, des); } /* Here we send another control message to notify the remote side * we had an error on a eager fragment. A non-zero value for the * ftr variable indicates that this was an eager RDMA fragment. * We need to do this in case the eager RDMA fragment after this * one actually made it successfully. */ if (0 != to_send_frag(des)->ftr) { mca_btl_openib_endpoint_notify(endpoint, MCA_BTL_OPENIB_CONTROL_EP_EAGER_RDMA_ERROR, (long)to_send_frag(des)->ftr - 1); } /* We know we have completed a send so return some resources even * though connection is broken. With SRQ, the resources are shared * so if we do not return the credits we may not be allowed to send * anymore. 
*/ qp_put_wqe(endpoint, qp); if((openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) && !BTL_OPENIB_QP_TYPE_PP(qp)) { OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1); } /* There are several queues associated with an endpoint that may * have some unsent fragments sitting in them. Remove them and * call the callback functions with an error so the PML can send * them down a different path. This really only needs to be called * once on an endpoint, but for now, just call it a bunch of times. * The first time through will remove the unsent fragments so * subsequent calls are no-ops. */ if (endpoint) { error_out_all_pending_frags(endpoint, &openib_btl->super, true); } }
/** * This function will find all the pending fragments on an endpoint * and call the callback function with OPAL_ERROR. It walks through * each qp with each priority and looks for both no_credits_pending_frags * and no_wqe_pending_frags. It then looks for any pending_lazy_frags, * pending_put_frags, and pending_get_frags. This function is only * called when running with failover support enabled. Note that * the errout parameter allows the function to also be used as a * debugging tool to see if there are any fragments on any of the * queues. * @param ep Pointer to endpoint that had error * @param module Pointer to module that had error * @param errout Boolean which says whether to error them out or not */ static void error_out_all_pending_frags(mca_btl_base_endpoint_t *ep, struct mca_btl_base_module_t* module, bool errout) { int qp, pri, len, total, btl_ownership; opal_list_item_t *item; mca_btl_openib_com_frag_t* frag; mca_btl_base_descriptor_t *des; int verbose = 10; /* Verbosity level unless debugging */ /* If debugging, drop verbosity level so we can see the output * regardless of the level the program was run with. */ if (false == errout) { verbose = 0; } total = 0; /* Traverse all QPs and all priorities and move to other endpoint */ for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) { for (pri = 0; pri < 2; ++pri) { /* All types of qp's have a no_wqe_pending_frags list */ len = opal_list_get_size(&ep->qps[qp].no_wqe_pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for no_wqe_pending_frags qp=%d, " "pri=%d, list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. 
no_wqe_pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in no_wqe_pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } if (BTL_OPENIB_QP_TYPE_PP(qp)) { len = opal_list_get_size(&ep->qps[qp].no_credits_pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for no_credits_pending_frags qp=%d, " "pri=%d, list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->qps[qp]. 
no_credits_pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in " "no_credits_pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } } else if (BTL_OPENIB_QP_TYPE_SRQ(qp)) { len = opal_list_get_size(&ep->endpoint_btl->qps[qp].u.srq_qp.pending_frags[pri]); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for srq pending_frags qp=%d, pri=%d, " "list size=%d", qp, pri, len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&ep->endpoint_btl->qps[qp]. 
u.srq_qp.pending_frags[pri]))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; /* Error out any coalesced frags if they exist */ if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_SEND) { opal_list_item_t *i; while((i = opal_list_remove_first(&to_send_frag(des)->coalesced_frags))) { opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Found coalesced frag in SRQ pending_frags"); btl_ownership = (to_base_frag(i)->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); to_base_frag(i)->base.des_cbfunc(module, ep, &to_base_frag(i)->base, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, &to_base_frag(i)->base); } } } btl_ownership = (des->des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP); des->des_cbfunc(module, ep, des, OPAL_ERROR); if( btl_ownership ) { mca_btl_openib_free(module, des); } } } } } } } /* Check for any frags from a connection that was never made. Not sure if this * can actually happen. */ len = opal_list_get_size(&ep->pending_lazy_frags); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for pending_lazy_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_lazy_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } len = opal_list_get_size(&ep->pending_put_frags); if (len > 0) { total += len; opal_output_verbose(verbose, mca_btl_openib_component.verbose_failover, "IB: Checking for pending_put_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } len = opal_list_get_size(&ep->pending_get_frags); if (len > 0) { total += len; opal_output_verbose(verbose, 
mca_btl_openib_component.verbose_failover, "IB: Checking for pending_get_frags, list size=%d", len); if (true == errout) { while (NULL != (item = opal_list_remove_first(&(ep->pending_put_frags)))) { frag = (mca_btl_openib_com_frag_t *) item; des = (mca_btl_base_descriptor_t *)frag; des->des_cbfunc(module, ep, des, OPAL_ERROR); } } } opal_output_verbose(verbose + 30, mca_btl_openib_component.verbose_failover, "IB: Finished checking for pending_frags, total moved=%d", total); }
/*
 * Called when the connect module has completed setup of an endpoint.
 *
 * Transitions the endpoint to MCA_BTL_IB_CONNECTED, kicks off connections
 * for any endpoints queued behind an XRC master, drains sends that were
 * queued while connecting, and restarts pending put/get operations.
 * NOTE(review): the endpoint_lock appears to be held by the caller —
 * this function releases it at the end; confirm against callers.
 */
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item, *ep_item;
    mca_btl_openib_send_frag_t *frag;
    mca_btl_openib_endpoint_t *ep;
    bool master = false;

    opal_output(-1, "Now we are CONNECTED");

    /* For XRC, decide whether this endpoint is the "master" of the shared
     * ib_addr: the first endpoint to connect claims the master role; later
     * endpoints see the address already connected.  The addr_lock is held
     * across the pending-endpoint processing below. */
    if (MCA_BTL_XRC_ENABLED) {
        OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
        if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
            /* We are not xrc master */
            /* set our qp pointer to master qp */
            master = false;
        } else {
            /* I'm master of XRC */
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
            master = true;
        }
    }

    /* Run over all qps and load alternative path (APM).  With XRC only the
     * master loads it (single shared qp); otherwise every qp gets one. */
    if (APM_ENABLED) {
        int i;
        if (MCA_BTL_XRC_ENABLED) {
            if (master) {
                mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint);
            }
        } else {
            for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
                mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint);
            }
        }
    }

    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    /* The connection is correctly setup. Now we can decrease the
     * event trigger. */
    opal_progress_event_users_decrement();

    /* XRC master: start the connect sequence for every endpoint that was
     * queued on this shared address while it was still connecting. */
    if(MCA_BTL_XRC_ENABLED) {
        if (master) {
            while (NULL != (ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep))) {
                ep = (mca_btl_openib_endpoint_t *)ep_item;
                if (OPAL_SUCCESS !=
                    opal_btl_openib_connect_base_start(endpoint->endpoint_local_cpc, ep)) {
                    BTL_ERROR(("Failed to connect pending endpoint\n"));
                }
            }
        }
        OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
    }

    /* Process pending packet on the endpoint */
    /* While there are frags in the list, process them */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */
        if (OPAL_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
            BTL_ERROR(("Error posting send"));
        }
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* if upper layer called put or get before connection moved to connected
     * state then we restart them here */
    mca_btl_openib_frag_progress_pending_put_get(endpoint,
                                                 mca_btl_openib_component.rdma_qp);
}