void mca_oob_ud_req_complete (mca_oob_ud_req_t *req, int rc)
{
    int i;

    opal_output_verbose(10, orte_oob_base_framework.framework_output,
                        "%s oob:ud:req_complete %s request %p completed with status %d",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (req->type == MCA_OOB_UD_REQ_SEND) ? "SEND" : "RECV",
                        (void *) req, rc);

    if (NULL != req->req_qp) {
        (void) mca_oob_ud_qp_data_release (req->req_qp);
        req->req_qp = NULL;
    }

    /* deregister memory *before* handing it to the callback */
    MCA_OOB_UD_REQ_DEREG_MR(req);

    switch (req->type) {
    case MCA_OOB_UD_REQ_SEND:
        if (req->req_data_type != MCA_OOB_UD_REQ_TR) {
            req->rml_msg->status = rc;
        }
        break;
    case MCA_OOB_UD_REQ_RECV:
        if ((req->req_target.jobid == ORTE_PROC_MY_NAME->jobid) &&
            (req->req_target.vpid == ORTE_PROC_MY_NAME->vpid)) {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s DELIVERING TO RML",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* flatten the iovec array into one contiguous buffer.
                 * size the allocation by the total payload length - the
                 * previous count * sizeof(struct iovec) sizing could
                 * overflow the buffer for large segments */
                int datalen = 0;
                char *data;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                data = (char *) malloc (datalen);
                datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base,
                            req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num,
                                      data, datalen);
                free(data);
            } else {
                ORTE_RML_POST_MESSAGE(&req->req_origin, req->req_tag, req->req_seq_num,
                                      req->req_data.buf.p, req->req_data.buf.size);
            }
        } else {
            opal_output_verbose(1, orte_oob_base_framework.framework_output,
                                "%s UD PROMOTING ROUTED MESSAGE FOR %s TO OOB",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&req->req_target));

            orte_rml_send_t *snd = OBJ_NEW(orte_rml_send_t);
            snd->dst = req->req_target;
            snd->origin = req->req_origin;
            snd->tag = req->req_tag;
            snd->seq_num = req->req_seq_num;
            if (MCA_OOB_UD_REQ_IOV == req->req_data_type) {
                /* same flattening as above: allocate the total payload length */
                int datalen = 0;
                char *data;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                data = (char *) malloc (datalen);
                datalen = 0;
                for (i = 0 ; i < req->req_data.iov.count; ++i) {
                    memcpy (&data[datalen], req->req_data.iov.uiov[i].iov_base,
                            req->req_data.iov.uiov[i].iov_len);
                    datalen += req->req_data.iov.uiov[i].iov_len;
                }
                snd->data = data;
                snd->count = datalen;
            } else {
                char *data = (char *) calloc (req->req_data.buf.size, sizeof(char));
                memcpy (data, req->req_data.buf.p, req->req_data.buf.size);
                snd->data = data;
                snd->count = req->req_data.buf.size;
            }
            snd->cbfunc.iov = NULL;
            snd->cbdata = NULL;
            /* activate the OOB send state */
            ORTE_OOB_SEND(snd);
        }
        break;
    default:
        break;
    }

    mca_oob_ud_req_return (req);
}
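/*
 * Illustrative sketch (standalone, not part of the ORTE tree): the IOV
 * branches above gather a scattered iovec array into one contiguous
 * buffer before handing it off.  A minimal self-contained version of
 * that gather, using only <sys/uio.h> and the C library, is shown
 * below.  The helper name iov_flatten is hypothetical.
 */
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Copy the payloads of `count` iovecs into one malloc'd buffer.
 * Stores the total length in *total_out and returns the buffer, or
 * returns NULL on allocation failure or empty input. */
static char *iov_flatten(const struct iovec *iov, int count, size_t *total_out)
{
    size_t total = 0, offset = 0;
    char *buf;
    int i;

    /* first pass: size the allocation by the sum of the payload
     * lengths - not by count * sizeof(struct iovec) */
    for (i = 0; i < count; ++i) {
        total += iov[i].iov_len;
    }
    if (0 == total || NULL == (buf = (char *) malloc(total))) {
        return NULL;
    }
    /* second pass: copy each segment at its running offset */
    for (i = 0; i < count; ++i) {
        memcpy(buf + offset, iov[i].iov_base, iov[i].iov_len);
        offset += iov[i].iov_len;
    }
    *total_out = total;
    return buf;
}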
void mca_oob_tcp_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_tcp_peer_t* peer = (mca_oob_tcp_peer_t*)cbdata;
    int rc;
    orte_process_name_t hop;
    mca_oob_tcp_peer_t *relay;
    uint64_t ui64;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:tcp:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_TCP_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_tcp_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_tcp_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_TCP_CONNECTED;
        } else {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;

    case MCA_OOB_TCP_CONNECTED:
        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:tcp:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_tcp_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_tcp_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:tcp:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                OPAL_TIMING_EVENT((&tm, "Header received from %s", ORTE_NAME_PRINT(&peer->name)));
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* convert the header */
                MCA_OOB_TCP_HDR_NTOH(&peer->recv_msg->hdr);
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name),
                                        peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                } else {
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:tcp:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:tcp:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_tcp_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                OPAL_TIMING_EVENT((&tm, "Msg received from %s", ORTE_NAME_PRINT(&peer->name)));
                /* we recvd all of the message */
                opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s (ORIGIN %s) OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->name),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient (header was already
                 * converted back to host order)? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s DELIVERING TO RML",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin,
                                          peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                } else {
                    /* no - find the next hop in the route */
                    hop = orte_routed.get_route(&peer->recv_msg->hdr.dst);
                    if (hop.jobid == ORTE_JOBID_INVALID ||
                        hop.vpid == ORTE_VPID_INVALID) {
                        /* no hop known - post the error to the component
                         * and let the OOB see if there is another way
                         * to get there from here */
                        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                            "%s NO ROUTE TO %s FROM HERE",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&peer->name));
                        /* let the component know about the problem */
                        ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_no_route);
                        /* cleanup */
                        OBJ_RELEASE(peer->recv_msg);
                        return;
                    } else {
                        /* do we know how to reach the next hop? */
                        memcpy(&ui64, (char*)&hop, sizeof(uint64_t));
                        if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(&mca_oob_tcp_module.peers,
                                                                             ui64, (void**)&relay)) {
                            opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                                "%s ADDRESS OF NEXT HOP %s TO %s IS UNKNOWN",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&hop),
                                                ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst));
                            /* let the component know about the problem */
                            ORTE_ACTIVATE_TCP_MSG_ERROR(NULL, peer->recv_msg, &hop, mca_oob_tcp_component_hop_unknown);
                            /* cleanup */
                            OBJ_RELEASE(peer->recv_msg);
                            return;
                        }
                        opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                            "%s ROUTING TO %s FROM HERE",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&relay->name));
                        /* if this came from a different job family, then ensure
                         * we know how to return */
                        if (ORTE_JOB_FAMILY(peer->recv_msg->hdr.origin.jobid) !=
                            ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) {
                            orte_routed.update_route(&(peer->recv_msg->hdr.origin), &peer->name);
                        }
                        /* post the message for retransmission */
                        MCA_OOB_TCP_QUEUE_RELAY(peer->recv_msg, relay);
                        OBJ_RELEASE(peer->recv_msg);
                    }
                }
                peer->recv_msg = NULL;
                return;
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;

    default:
        opal_output(0, "%s-%s mca_oob_tcp_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_tcp_peer_close(peer);
        break;
    }
}
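/*
 * Illustrative sketch (standalone, not the ORTE read_bytes()): the TCP
 * and USOCK handlers above resume a partially-read message by keeping a
 * cursor (rdptr) and a remaining-byte count (rdbytes) inside the message,
 * returning to the event loop whenever the nonblocking socket reports
 * EAGAIN.  The names partial_read and PR_* are hypothetical.
 */
#include <errno.h>
#include <unistd.h>

typedef enum { PR_DONE, PR_AGAIN, PR_CLOSED, PR_ERROR } pr_status_t;

/* Advance *rdptr / *rdbytes by whatever the socket will give us right
 * now.  Returns PR_DONE when the region is complete, PR_AGAIN when the
 * caller should wait for the next readable event. */
static pr_status_t partial_read(int sd, char **rdptr, size_t *rdbytes)
{
    while (*rdbytes > 0) {
        ssize_t n = read(sd, *rdptr, *rdbytes);
        if (n < 0) {
            if (EAGAIN == errno || EWOULDBLOCK == errno) {
                return PR_AGAIN;      /* let the event loop progress */
            }
            if (EINTR == errno) {
                continue;             /* retry the interrupted read */
            }
            return PR_ERROR;          /* real error - close the peer */
        }
        if (0 == n) {
            return PR_CLOSED;         /* peer hung up */
        }
        *rdptr += n;                  /* remember where we left off */
        *rdbytes -= n;
    }
    return PR_DONE;                   /* header or payload fully read */
}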
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tg,
                       void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay = NULL, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup, datbuf, *data;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod, *nidmap;
    size_t inlen, cmplen;
    uint8_t *packed_data, *cmpdata;
    int32_t nvals, i;
    opal_value_t kv, *kval;
    orte_process_name_t dmn;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children - we leave it
     * as compressed data */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);
    OBJ_CONSTRUCT(&datbuf, opal_buffer_t);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* unpack the flag to see if this payload is compressed */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        return;
    }
    if (flag) {
        /* unpack the packed (compressed) data size */
        cnt = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &inlen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* unpack the uncompressed data size */
        cnt = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &cmplen, &cnt, OPAL_SIZE))) {
            ORTE_ERROR_LOG(ret);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* allocate the space */
        packed_data = (uint8_t*)malloc(inlen);
        /* unpack the data blob */
        cnt = inlen;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, packed_data, &cnt, OPAL_UINT8))) {
            ORTE_ERROR_LOG(ret);
            free(packed_data);
            ORTE_FORCED_TERMINATE(ret);
            OBJ_DESTRUCT(&datbuf);
            OBJ_DESTRUCT(&coll);
            OBJ_RELEASE(rly);
            return;
        }
        /* decompress the data */
        if (orte_util_uncompress_block(&cmpdata, cmplen, packed_data, inlen)) {
            /* the data has been uncompressed */
            opal_dss.load(&datbuf, cmpdata, cmplen);
            data = &datbuf;
        } else {
            data = buffer;
        }
        free(packed_data);
    } else {
        data = buffer;
    }

    /* unpack the signature - we don't need it here, so just release it */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&datbuf);
        OBJ_DESTRUCT(&coll);
        OBJ_RELEASE(rly);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt = 1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(data, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is an exit cmd, then flag that we are quitting so we will properly
             * handle connection losses from our downstream peers */
            if (ORTE_DAEMON_EXIT_CMD == command ||
                ORTE_DAEMON_HALT_VM_CMD == command) {
                orte_orteds_term_ordered = true;
                if (ORTE_DAEMON_HALT_VM_CMD == command) {
                    /* this is an abnormal termination */
                    orte_abnormal_term_ordered = true;
                }
                /* copy the msg for relay to ourselves */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                opal_dss.copy_payload(relay, data);
            } else if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                       ORTE_DAEMON_DVM_NIDMAP_CMD == command ||
                       ORTE_DAEMON_DVM_ADD_PROCS == command) {
                /* setup our internal relay buffer */
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* unpack the nidmap string - may be NULL */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &nidmap, &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (NULL != nidmap) {
                    if (ORTE_SUCCESS != (ret = orte_regx.nidmap_parse(nidmap))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    free(nidmap);
                }
                /* see if they included info on node capabilities */
                cnt = 1;
                if (OPAL_SUCCESS != (ret = opal_dss.unpack(data, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 != flag) {
                    /* update our local nidmap, if required - the decode function
                     * knows what to do */
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    if (ORTE_SUCCESS != (ret = orte_regx.decode_daemon_nodemap(data))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (!ORTE_PROC_IS_HNP) {
                        /* update the routing plan - the HNP already did
                         * it when it computed the VM, so don't waste time
                         * re-doing it here */
                        orte_routed.update_routing_plan(rtmod);
                    }
                    /* routing is now possible */
                    orte_routed_base.routing_enabled = true;

                    /* unpack the byte object */
                    cnt = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(data, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 < bo->size) {
                        /* load it into a buffer */
                        OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                        opal_dss.load(&wireup, bo->bytes, bo->size);
                        /* decode it, pushing the info into our database */
                        if (opal_pmix.legacy_get()) {
                            OBJ_CONSTRUCT(&kv, opal_value_t);
                            kv.key = OPAL_PMIX_PROC_URI;
                            kv.type = OPAL_STRING;
                            cnt = 1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kv.data.string, &cnt, OPAL_STRING))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, &kv))) {
                                    ORTE_ERROR_LOG(ret);
                                    free(kv.data.string);
                                    break;
                                }
                                free(kv.data.string);
                                kv.data.string = NULL;
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        } else {
                            cnt = 1;
                            while (OPAL_SUCCESS == (ret = opal_dss.unpack(&wireup, &dmn, &cnt, ORTE_NAME))) {
                                cnt = 1;
                                if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &nvals, &cnt, OPAL_INT32))) {
                                    ORTE_ERROR_LOG(ret);
                                    break;
                                }
                                for (i = 0; i < nvals; i++) {
                                    cnt = 1;
                                    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&wireup, &kval, &cnt, OPAL_VALUE))) {
                                        ORTE_ERROR_LOG(ret);
                                        break;
                                    }
                                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                                         "%s STORING MODEX DATA FOR PROC %s KEY %s",
                                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                         ORTE_NAME_PRINT(&dmn), kval->key));
                                    if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&dmn, kval))) {
                                        ORTE_ERROR_LOG(ret);
                                        OBJ_RELEASE(kval);
                                        break;
                                    }
                                    OBJ_RELEASE(kval);
                                }
                            }
                            if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != ret) {
                                ORTE_ERROR_LOG(ret);
                            }
                        }
                        /* done with the wireup buffer - dump it */
                        OBJ_DESTRUCT(&wireup);
                    }
                    free(bo);
                }
                /* copy the remainder of the payload - we don't pass wiring info
                 * to the odls */
                opal_dss.copy_payload(relay, data);
            } else {
                relay = OBJ_NEW(opal_buffer_t);
                /* repack the command */
                if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                /* copy the msg for relay to ourselves */
                opal_dss.copy_payload(relay, data);
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    } else {
        /* copy the msg for relay to ourselves */
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(relay, data);
    }

 relay:
    if (!orte_do_not_launch) {
        /* get the list of next recipients from the routed module */
        orte_routed.get_routing_list(rtmod, &coll);
        /* if the list is empty, no relay is required */
        if (opal_list_is_empty(&coll)) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay - recipient list is empty!",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto CLEANUP;
        }
        /* send the message to each recipient on the list, deconstructing it as we go */
        while (NULL != (item = opal_list_remove_first(&coll))) {
            nm = (orte_namelist_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (int)rly->bytes_used,
                                 ORTE_NAME_PRINT(&nm->name)));
            OBJ_RETAIN(rly);
            /* check the state of the recipient - no point
             * sending to someone not alive */
            jdata = orte_get_job_data_object(nm->name.jobid);
            if (NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&nm->name));
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if ((ORTE_PROC_STATE_RUNNING < rec->state &&
                 ORTE_PROC_STATE_CALLED_ABORT != rec->state) ||
                !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
                if (!orte_abnormal_term_ordered && !orte_orteds_term_ordered) {
                    opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay: %s ",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&nm->name),
                                ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE) ?
                                    orte_proc_state_to_str(rec->state) : "NOT ALIVE");
                }
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                               &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                               orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(rly);
                OBJ_RELEASE(item);
                ORTE_FORCED_TERMINATE(ORTE_ERR_UNREACH);
                continue;
            }
            OBJ_RELEASE(item);
        }
    }

 CLEANUP:
    /* cleanup */
    OPAL_LIST_DESTRUCT(&coll);
    OBJ_RELEASE(rly);  // retain accounting

    /* now pass the relay buffer to myself for processing - don't
     * inject it into the RML system via send as that will compete
     * with the relay messages down in the OOB. Instead, pass it
     * directly to the RML message processor. Guard against the error
     * path above where no relay buffer was ever created */
    if (NULL != relay) {
        if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
            ORTE_RML_POST_MESSAGE(ORTE_PROC_MY_NAME, tag, 1,
                                  relay->base_ptr, relay->bytes_used);
            relay->base_ptr = NULL;
            relay->bytes_used = 0;
        }
        OBJ_RELEASE(relay);
    }
    OBJ_DESTRUCT(&datbuf);
}
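/*
 * Illustrative sketch (standalone): the xcast payload above is framed as
 * <flag><packed len><original len><blob> so the receiver can size the
 * destination before decompressing.  The real message packs these fields
 * through opal_dss and inflates with orte_util_uncompress_block(); this
 * sketch substitutes a raw byte layout and zlib's uncompress() purely to
 * show the same shape.  The name inflate_framed is hypothetical; compile
 * with -lz.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

/* Recover the original bytes from a framed blob.  *out/*outlen receive
 * a malloc'd buffer holding the payload.  Returns 0 on success. */
static int inflate_framed(const uint8_t *msg, size_t msglen,
                          uint8_t **out, size_t *outlen)
{
    size_t clen, olen;
    uLongf dlen;

    if (msglen < 1) return -1;
    if (0 == msg[0]) {
        /* flag clear: the remainder is the payload, uncompressed */
        *outlen = msglen - 1;
        if (NULL == (*out = (uint8_t *) malloc(*outlen))) return -1;
        memcpy(*out, msg + 1, *outlen);
        return 0;
    }
    /* flag set: read both lengths, then inflate the blob */
    if (msglen < 1 + 2 * sizeof(size_t)) return -1;
    memcpy(&clen, msg + 1, sizeof(size_t));
    memcpy(&olen, msg + 1 + sizeof(size_t), sizeof(size_t));
    if (msglen < 1 + 2 * sizeof(size_t) + clen) return -1;

    if (NULL == (*out = (uint8_t *) malloc(olen))) return -1;
    dlen = olen;
    if (Z_OK != uncompress((Bytef *) *out, &dlen,
                           (const Bytef *) (msg + 1 + 2 * sizeof(size_t)),
                           (uLong) clen) || dlen != olen) {
        free(*out);   /* bad frame or corrupt data - caller falls back */
        return -1;
    }
    *outlen = olen;
    return 0;
}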
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    int rc;
    orte_rml_send_t *snd;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:usock:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_USOCK_CONNECTED;
        } else {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;

    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:usock:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name),
                                        peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                    peer->recv_msg->rdptr = NULL;
                    peer->recv_msg->rdbytes = 0;
                } else {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:usock:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:usock:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_usock_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* we recvd all of the message */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s DELIVERING TO RML",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin,
                                          peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                    /* clear the pointer so the next event starts a fresh message */
                    peer->recv_msg = NULL;
                } else {
                    /* no - we don't route things, so we promote this
                     * back to the OOB and let another transport move
                     * it along. If we are a daemon and it is intended
                     * for another of our local procs, it will just come
                     * back to us and be handled then */
                    snd = OBJ_NEW(orte_rml_send_t);
                    snd->dst = peer->recv_msg->hdr.dst;
                    snd->origin = peer->recv_msg->hdr.origin;
                    snd->tag = peer->recv_msg->hdr.tag;
                    snd->data = peer->recv_msg->data;
                    snd->seq_num = peer->recv_msg->hdr.seq_num;
                    snd->count = peer->recv_msg->hdr.nbytes;
                    snd->cbfunc.iov = NULL;
                    snd->cbdata = NULL;
                    /* activate the OOB send state */
                    ORTE_OOB_SEND(snd);
                    /* protect the data - the send now owns it */
                    peer->recv_msg->data = NULL;
                    /* cleanup */
                    OBJ_RELEASE(peer->recv_msg);
                    peer->recv_msg = NULL;
                    return;
                }
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;

    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_usock_peer_close(peer);
        break;
    }
}
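/*
 * Illustrative sketch (standalone): the promote-to-OOB branch above moves
 * the malloc'd payload from the receive object into the new send and then
 * NULLs the receive side's pointer ("protect the data"), so exactly one
 * owner ever frees the buffer.  The msg_t type and take_ownership()
 * helper here are hypothetical stand-ins for the ORTE objects.
 */
#include <stddef.h>
#include <stdlib.h>

typedef struct { char *data; size_t len; } msg_t;

/* Move the payload out of src and into dst.  After this call src no
 * longer owns the buffer, so src's destructor can free(src->data)
 * unconditionally without double-freeing. */
static void take_ownership(msg_t *dst, msg_t *src)
{
    dst->data = src->data;
    dst->len  = src->len;
    src->data = NULL;   /* mirrors peer->recv_msg->data = NULL above */
    src->len  = 0;
}

static void msg_destruct(msg_t *m)
{
    free(m->data);      /* free(NULL) is a no-op, so this is always safe */
    m->data = NULL;
}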