/*
 * Non blocking RML recv callback for the XOOB connection protocol.
 *
 * Reads the incoming connection message from `buffer` into a temporary
 * rem_info, looks up the matching local endpoint and acts on the message
 * type:
 *   - ENDPOINT_XOOB_CONNECT_REQUEST:         prepost receives and reply via
 *                                            xoob_reply_first_connect()
 *   - ENDPOINT_XOOB_CONNECT_XRC_REQUEST:     try xoob_recv_qp_connect(); answer
 *                                            with XRC_RESPONSE on success or
 *                                            XRC_NR_RESPONSE otherwise
 *   - ENDPOINT_XOOB_CONNECT_RESPONSE:        store the remote info, connect the
 *                                            send QP and complete the CPC
 *   - ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:    store the remote info and complete
 *                                            the CPC
 *   - ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE: restart the connection
 *
 * `status`, `tag` and `cbdata` are not used by this function.
 *
 * Once init_rem_info() has succeeded, every exit goes through the `cleanup`
 * label so that free_rem_info() is always called on the temporary rem_info
 * (previously the error paths returned without releasing it).
 */
static void xoob_rml_recv_cb(int status, ompi_process_name_t* process_name,
        opal_buffer_t* buffer, ompi_rml_tag_t tag,
        void* cbdata)
{
    int rc;
    uint8_t message_type;
    uint16_t requested_lid = 0;
    mca_btl_openib_rem_info_t rem_info;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;

    /* Nothing to release if the init itself failed */
    if ( OMPI_SUCCESS != init_rem_info(&rem_info)) {
        return;
    }

    /* Get data. */
    if ( OMPI_SUCCESS !=
            xoob_receive_connect_data(&rem_info, &requested_lid, &message_type, buffer)) {
        BTL_ERROR(("Failed to read data\n"));
        mca_btl_openib_endpoint_invoke_error(NULL);
        goto cleanup;
    }

    /* Processing message */
    switch (message_type) {
        case ENDPOINT_XOOB_CONNECT_REQUEST:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_REQUEST: lid %d, sid %" PRIx64 ", rlid %d\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id,
                        requested_lid));
            ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
                    requested_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST."
                           " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                           rem_info.rem_subnet_id,requested_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* prepost data on receiver site */
            if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
                BTL_ERROR(("Failed to post on XRC SRQs"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            /* we should create qp and send the info + srq to requestor */
            rc = xoob_reply_first_connect(ib_endpoint, &rem_info);
            if (OMPI_SUCCESS != rc) {
                BTL_ERROR(("error in endpoint reply start connect"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            /* enable pooling for this btl */
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;

        case ENDPOINT_XOOB_CONNECT_XRC_REQUEST:
            /* pasha we don't need the remote lid here ??*/
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_REQUEST: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name,rem_info.rem_subnet_id,
                    requested_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST."
                           " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                           rem_info.rem_subnet_id,requested_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            if (OMPI_SUCCESS == xoob_recv_qp_connect(ib_endpoint, &rem_info)) {
                /* Receives are posted before the endpoint lock is taken here */
                if (OMPI_SUCCESS != mca_btl_openib_endpoint_post_recvs(ib_endpoint)) {
                    BTL_ERROR(("Failed to post on XRC SRQs"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    goto cleanup;
                }
                OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
                rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE);
                if (OMPI_SUCCESS != rc) {
                    BTL_ERROR(("error in endpoint reply start connect"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                    goto cleanup;
                }
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            } else {
                /* The XRC recv qp was destroyed */
                OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
                rc = xoob_send_connect_data(ib_endpoint, ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE);
                if (OMPI_SUCCESS != rc) {
                    BTL_ERROR(("error in endpoint reply start connect"));
                    mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                    OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                    goto cleanup;
                }
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            }
            /* enable pooling for this btl */
            break;

        case ENDPOINT_XOOB_CONNECT_RESPONSE:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_RESPONSE."
                           " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                           rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* we got all the data srq. switch the endpoint to connect mode */
            XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
            /* update ib_addr with remote qp number */
            ib_endpoint->ib_addr->remote_xrc_rcv_qp_num =
                ib_endpoint->rem_info.rem_qps->rem_qp_num;
            BTL_VERBOSE(("rem_info: lid %d, sid %" PRIx64 " ep %d %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id,
                        ib_endpoint->rem_info.rem_lid,
                        ib_endpoint->rem_info.rem_subnet_id));
            if (OMPI_SUCCESS != xoob_send_qp_connect(ib_endpoint, &rem_info)) {
                BTL_ERROR(("Failed to connect endpoint\n"));
                mca_btl_openib_endpoint_invoke_error(NULL);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                goto cleanup;
            }
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;

        case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE:
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE."
                           " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                           rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
            /* we got srq numbers on our request */
            XOOB_SET_REMOTE_INFO(ib_endpoint->rem_info, rem_info);
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;

        case ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE:
            /* The XRC recv site already was destroyed so we need
             * start to bringup the connection from scratch */
            BTL_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE: lid %d, sid %" PRIx64 "\n",
                        rem_info.rem_lid,
                        rem_info.rem_subnet_id));
            ib_endpoint = xoob_find_endpoint(process_name, rem_info.rem_subnet_id,
                    rem_info.rem_lid, message_type);
            if ( NULL == ib_endpoint) {
                BTL_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE."
                           " Failed to find endpoint with subnet %" PRIx64 " and LID %d",
                           rem_info.rem_subnet_id,rem_info.rem_lid));
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto cleanup;
            }
            xoob_restart_connect(ib_endpoint);
            break;

        default :
            BTL_ERROR(("Invalid message type %d", message_type));
            break;
    }

cleanup:
    /* Single release point for the temporary rem_info on all paths */
    free_rem_info(&rem_info);
}
/*
 * Non blocking RML recv callback. Read incoming QP and other info,
 * and if this endpoint is trying to connect, reply with our QP info,
 * otherwise try to modify QP's and establish reliable connection.
 *
 * Message layout as unpacked here:
 *   - message type (uint8) and remote subnet id (uint64): always
 *   - local QP number (uint32) and local LID (uint16): for every message
 *     except ENDPOINT_CONNECT_REQUEST
 *   - per-QP (qp_num, psn) pairs, remote LID, MTU and index: for every
 *     message except ENDPOINT_CONNECT_ACK
 *
 * After unpacking, the ib_procs list is searched (under ib_lock) for the
 * proc matching `process_name`, a suitable endpoint is selected, and the
 * endpoint state machine is advanced under the endpoint lock.
 *
 * `status`, `tag` and `cbdata` are not used by this function.
 *
 * Memory: rem_info.rem_qps is allocated here. It is released on every path
 * that does not pass rem_info to reply_start_connect() or set_remote_info().
 * NOTE(review): those two helpers are not visible from here; they are
 * assumed to possibly keep the rem_qps pointer, so the buffer is
 * deliberately NOT freed after calling them -- confirm their ownership
 * semantics.
 */
static void rml_recv_cb(int status, orte_process_name_t* process_name,
                        opal_buffer_t* buffer, orte_rml_tag_t tag,
                        void* cbdata)
{
    mca_btl_openib_proc_t *ib_proc;
    mca_btl_openib_endpoint_t *ib_endpoint = NULL;
    int endpoint_state;
    int rc;
    uint32_t i, lcl_qp = 0;
    uint16_t lcl_lid = 0;
    int32_t cnt = 1;
    /* Zero-initialized so that fields which are not unpacked for a given
       message type (e.g. rem_lid / rem_qps for an ACK) are well defined */
    mca_btl_openib_rem_info_t rem_info = {0};
    uint8_t message_type;
    bool master;
    /* Set once rem_info has been given to a helper that may keep rem_qps */
    bool rem_qps_handed_off = false;

    /* start by unpacking data first so we know who is knocking at
       our door */
    BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8));
    rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        mca_btl_openib_endpoint_invoke_error(NULL);
        return;
    }

    BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64));
    rc = opal_dss.unpack(buffer, &rem_info.rem_subnet_id, &cnt, OPAL_UINT64);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        mca_btl_openib_endpoint_invoke_error(NULL);
        return;
    }

    if (ENDPOINT_CONNECT_REQUEST != message_type) {
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            mca_btl_openib_endpoint_invoke_error(NULL);
            return;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
        rc = opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            mca_btl_openib_endpoint_invoke_error(NULL);
            return;
        }
    }

    if (ENDPOINT_CONNECT_ACK != message_type) {
        int qp;

        /* get ready for the data */
        rem_info.rem_qps =
            malloc(sizeof(*rem_info.rem_qps) * mca_btl_openib_component.num_qps);
        if (NULL == rem_info.rem_qps) {
            /* The unpack loop below writes through rem_qps */
            BTL_ERROR(("Failed to allocate memory for remote QP data\n"));
            mca_btl_openib_endpoint_invoke_error(NULL);
            return;
        }

        /* unpack all the qp info */
        for (qp = 0; qp < mca_btl_openib_component.num_qps; ++qp) {
            BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
            rc = opal_dss.unpack(buffer, &rem_info.rem_qps[qp].rem_qp_num, &cnt,
                                 OPAL_UINT32);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto release_rem_info;
            }
            BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
            rc = opal_dss.unpack(buffer, &rem_info.rem_qps[qp].rem_psn, &cnt,
                                 OPAL_UINT32);
            if (ORTE_SUCCESS != rc) {
                ORTE_ERROR_LOG(rc);
                mca_btl_openib_endpoint_invoke_error(NULL);
                goto release_rem_info;
            }
        }

        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16));
        rc = opal_dss.unpack(buffer, &rem_info.rem_lid, &cnt, OPAL_UINT16);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            mca_btl_openib_endpoint_invoke_error(NULL);
            goto release_rem_info;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &rem_info.rem_mtu, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            mca_btl_openib_endpoint_invoke_error(NULL);
            goto release_rem_info;
        }
        BTL_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32));
        rc = opal_dss.unpack(buffer, &rem_info.rem_index, &cnt, OPAL_UINT32);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            mca_btl_openib_endpoint_invoke_error(NULL);
            goto release_rem_info;
        }
    }

    /* rem_subnet_id is unpacked as a 64-bit value, so print it with PRIx64 */
    BTL_VERBOSE(("Received QP Info,  LID = %d, SUBNET = %016" PRIx64 "\n",
                 rem_info.rem_lid,
                 rem_info.rem_subnet_id));

    /* We are "master" when our own name compares greater than the sender's */
    master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
                                           process_name) > 0;

    /* Need to protect the ib_procs list */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);

    for (ib_proc = (mca_btl_openib_proc_t*)
            opal_list_get_first(&mca_btl_openib_component.ib_procs);
        ib_proc != (mca_btl_openib_proc_t*)
            opal_list_get_end(&mca_btl_openib_component.ib_procs);
        ib_proc  = (mca_btl_openib_proc_t*)opal_list_get_next(ib_proc)) {
        bool found = false;

        if (orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                   &ib_proc->proc_guid, process_name) != OPAL_EQUAL) {
            continue;
        }

        if (ENDPOINT_CONNECT_REQUEST != message_type) {
            /* This is a reply message. Try to get the endpoint
               instance the reply belongs to */
            for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
                ib_endpoint = ib_proc->proc_endpoints[i];
                if (ib_endpoint->qps[0].qp->lcl_qp != NULL &&
                    lcl_lid == ib_endpoint->endpoint_btl->lid &&
                    lcl_qp == ib_endpoint->qps[0].qp->lcl_qp->qp_num &&
                    rem_info.rem_subnet_id == ib_endpoint->subnet_id) {
                    found = true;
                    break;
                }
            }
        } else {
            /* This is new connection request. If this is master try
               to find endpoint in a connecting state. If this is
               slave try to find  endpoint in closed state and
               initiate connection back */
            mca_btl_openib_endpoint_t *ib_endpoint_found = NULL;
            for (i = 0; i < ib_proc->proc_endpoint_count; i++) {
                ib_endpoint = ib_proc->proc_endpoints[i];
                if (ib_endpoint->subnet_id != rem_info.rem_subnet_id ||
                   (ib_endpoint->endpoint_state != MCA_BTL_IB_CONNECTING
                    && ib_endpoint->endpoint_state != MCA_BTL_IB_CLOSED)) {
                    continue;
                }
                found = true;
                ib_endpoint_found = ib_endpoint;
                if ((master &&
                     MCA_BTL_IB_CONNECTING == ib_endpoint->endpoint_state) ||
                    (!master &&
                     MCA_BTL_IB_CLOSED == ib_endpoint->endpoint_state)) {
                    break; /* Found one. No point to continue */
                }
            }
            ib_endpoint = ib_endpoint_found;

            /* if this is slave and there is no endpoints in closed
               state then all connection are already in progress so
               just ignore this connection request */
            if (found && !master &&
                MCA_BTL_IB_CLOSED != ib_endpoint->endpoint_state) {
                OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
                goto release_rem_info;
            }
        }

        if (!found) {
            BTL_ERROR(("can't find suitable endpoint for this peer\n"));
            mca_btl_openib_endpoint_invoke_error(NULL);
            OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
            goto release_rem_info;
        }

        OPAL_THREAD_LOCK(&ib_endpoint->endpoint_lock);
        endpoint_state = ib_endpoint->endpoint_state;

        /* Update status */
        switch (endpoint_state) {
        case MCA_BTL_IB_CLOSED :
            /* We had this connection closed before.  The endpoint is
               trying to connect. Move the status of this connection
               to CONNECTING, and then reply with our QP
               information */
            if (master) {
                /* rem_info is given to the reply helper; see the ownership
                   note in the function header */
                rem_qps_handed_off = true;
                rc = reply_start_connect(ib_endpoint, &rem_info);
            } else {
                rc = oob_module_start_connect(ib_endpoint->endpoint_local_cpc,
                                              ib_endpoint);
            }

            if (OMPI_SUCCESS != rc) {
                BTL_ERROR(("error in endpoint reply start connect"));
                mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                break;
            }

            /* As long as we expect a message from the peer (in order
               to setup the connection) let the event engine pool the
               RML events. Note: we increment it once peer active
               connection. */
            opal_progress_event_users_increment();
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;

        case MCA_BTL_IB_CONNECTING :
            /* rem_info is given to set_remote_info(); see the ownership
               note in the function header */
            rem_qps_handed_off = true;
            set_remote_info(ib_endpoint, &rem_info);
            if (OMPI_SUCCESS != (rc = qp_connect_all(ib_endpoint))) {
                BTL_ERROR(("endpoint connect error: %d", rc));
                mca_btl_openib_endpoint_invoke_error(ib_endpoint);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
                break;
            }

            if (master) {
                ib_endpoint->endpoint_state = MCA_BTL_IB_WAITING_ACK;

                /* Send him an ACK */
                send_connect_data(ib_endpoint, ENDPOINT_CONNECT_RESPONSE);
                OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            } else {
                send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
                /* Tell main BTL that we're done */
                mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
                /* cpc complete unlock the endpoint */
            }
            break;

        case MCA_BTL_IB_WAITING_ACK:
            /* Tell main BTL that we're done */
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;

        case MCA_BTL_IB_CONNECT_ACK:
            send_connect_data(ib_endpoint, ENDPOINT_CONNECT_ACK);
            /* Tell main BTL that we're done */
            mca_btl_openib_endpoint_cpc_complete(ib_endpoint);
            /* cpc complete unlock the endpoint */
            break;

        case MCA_BTL_IB_CONNECTED:
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
            break;

        default :
            BTL_ERROR(("Invalid endpoint state %d", endpoint_state));
            mca_btl_openib_endpoint_invoke_error(ib_endpoint);
            OPAL_THREAD_UNLOCK(&ib_endpoint->endpoint_lock);
        }
        /* Only the first proc matching process_name is handled */
        break;
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);

release_rem_info:
    /* rem_qps is NULL when nothing was allocated (ACK message); free(NULL)
       is a no-op */
    if (!rem_qps_handed_off) {
        free(rem_info.rem_qps);
    }
}