Example #1
0
static int _copy_payload(Buf inbuf, size_t offs, Buf *outbuf)
{
	size_t total_size, copy_size;
	char *ptr;
	pmix_proc_t *procs = NULL;
	size_t nprocs = 0;
	pmixp_coll_type_t type = 0;
	Buf buf;

	total_size = get_buf_offset(inbuf);
	set_buf_offset(inbuf, offs);
	int rc = pmixp_coll_unpack_ranges(inbuf, &type, &procs, &nprocs);
	xfree(procs);
	ptr = get_buf_data(inbuf) + get_buf_offset(inbuf);
	copy_size = total_size - get_buf_offset(inbuf);
	buf = init_buf(copy_size);
	memcpy(get_buf_data(buf), ptr, copy_size);
	*outbuf = buf;
	set_buf_offset(inbuf, total_size);
	return rc;
}
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s", nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is unexepable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
					    SIGKILL);

			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}
	xfree(nodename);
}