static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf) { int rc; switch (hdr->type) { case PMIXP_MSG_FAN_IN: case PMIXP_MSG_FAN_OUT: { pmixp_coll_t *coll; pmixp_proc_t *procs = NULL; size_t nprocs = 0; pmixp_coll_type_t type = 0; int c_nodeid; rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid, &procs, &nprocs); if (SLURM_SUCCESS != rc) { char *nodename = pmixp_info_job_host(hdr->nodeid); PMIXP_ERROR("Bad message header from node %s", nodename); xfree(nodename); goto exit; } coll = pmixp_state_coll_get(type, procs, nprocs); xfree(procs); PMIXP_DEBUG("FENCE collective message from nodeid = %u, " "type = %s, seq = %d", hdr->nodeid, ((PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out"), hdr->seq); rc = pmixp_coll_check_seq(coll, hdr->seq); if (PMIXP_COLL_REQ_FAILURE == rc) { /* this is unexepable event: either something went * really wrong or the state machine is incorrect. * This will 100% lead to application hang. */ char *nodename = pmixp_info_job_host(hdr->nodeid); PMIXP_ERROR("Bad collective seq. #%d from %s, current" " is %d", hdr->seq, nodename, coll->seq); pmixp_debug_hang(0); /* enable hang to debug this! */ slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL); xfree(nodename); break; } else if (PMIXP_COLL_REQ_SKIP == rc) { PMIXP_DEBUG("Wrong collective seq. #%d from" " nodeid %u, current is %d, skip " "this message", hdr->seq, hdr->nodeid, coll->seq); goto exit; } if (PMIXP_MSG_FAN_IN == hdr->type) { pmixp_coll_contrib_child(coll, hdr->nodeid, hdr->seq, buf); } else { pmixp_coll_contrib_parent(coll, hdr->nodeid, hdr->seq, buf); } break; } case PMIXP_MSG_DMDX: { pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq); /* buf will be free'd by the PMIx callback so * protect the data by voiding the buffer. * Use the statement below instead of (buf = NULL) * to maintain incapsulation - in general `buf`is * not a pointer, but opaque type. */ buf = create_buf(NULL, 0); break; } case PMIXP_MSG_INIT_DIRECT: PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid); break; #ifndef NDEBUG case PMIXP_MSG_PINGPONG: { /* if the pingpong mode was activated - * node 0 sends ping requests * and receiver assumed to respond back to node 0 */ int msize = remaining_buf(buf); if (pmixp_info_nodeid()) { pmixp_server_pp_send(0, msize); } else { if (pmixp_server_pp_same_thread()) { if (pmixp_server_pp_count() == pmixp_server_pp_warmups()) { pmixp_server_pp_start(); } if (!pmixp_server_pp_check_fini(msize)) { pmixp_server_pp_send(1, msize); } } } pmixp_server_pp_inc(); break; } #endif default: PMIXP_ERROR("Unknown message type %d", hdr->type); break; } exit: free_buf(buf); }
static void _process_server_request(recv_header_t *_hdr, void *payload) { send_header_t *hdr = &_hdr->send_hdr; char *nodename = pmixp_info_job_host(hdr->nodeid); Buf buf; int rc; buf = create_buf(payload, hdr->msgsize); switch (hdr->type) { case PMIXP_MSG_FAN_IN: case PMIXP_MSG_FAN_OUT: { pmixp_coll_t *coll; pmix_proc_t *procs = NULL; size_t nprocs = 0; pmixp_coll_type_t type = 0; rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs); if (SLURM_SUCCESS != rc) { PMIXP_ERROR("Bad message header from node %s", nodename); return; } coll = pmixp_state_coll_get(type, procs, nprocs); xfree(procs); PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d", nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out", hdr->seq); rc = pmixp_coll_check_seq(coll, hdr->seq, nodename); if (PMIXP_COLL_REQ_FAILURE == rc) { /* this is unexepable event: either something went * really wrong or the state machine is incorrect. * This will 100% lead to application hang. */ PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d", hdr->seq, nodename, coll->seq); pmixp_debug_hang(0); /* enable hang to debug this! */ slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL); break; } else if (PMIXP_COLL_REQ_SKIP == rc) { PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message", hdr->seq, nodename, coll->seq); free_buf(buf); break; } if (PMIXP_MSG_FAN_IN == hdr->type) { pmixp_coll_contrib_node(coll, nodename, buf); /* we don't need this buffer anymore */ free_buf(buf); } else { pmixp_coll_bcast(coll, buf); /* buf will be free'd by the PMIx callback */ } break; } case PMIXP_MSG_DMDX: { pmixp_dmdx_process(buf, nodename, hdr->seq); break; } case PMIXP_MSG_HEALTH_CHK: { /* this is just health ping. * TODO: can we do something more sophisticated? */ free_buf(buf); break; } default: PMIXP_ERROR("Unknown message type %d", hdr->type); break; } xfree(nodename); }