示例#1
0
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(),
				    pmixp_info_stepid(), SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Wrong collective seq. #%d from nodeid %u, current is %d, skip this message",
			    hdr->seq, hdr->nodeid, coll->seq);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
示例#2
0
int pmixp_server_pp_send(int nodeid, int size)
{
	Buf buf = pmixp_server_buf_new();
	int rc;
	pmixp_ep_t ep;
	struct pp_cbdata *cbdata = xmalloc(sizeof(*cbdata));

	grow_buf(buf, size);
	ep.type = PMIXP_EP_NOIDEID;
	ep.ep.nodeid = nodeid;
	cbdata->buf = buf;
	cbdata->size = size;
	set_buf_offset(buf,get_buf_offset(buf) + size);
	rc = pmixp_server_send_nb(&ep, PMIXP_MSG_PINGPONG,
				  _pmixp_pp_count, buf, pingpong_complete,
				  (void*)cbdata);
	if (SLURM_SUCCESS != rc) {
		char *nodename = pmixp_info_job_host(nodeid);
		PMIXP_ERROR("Was unable to wait for the parent %s to "
			    "become alive",
			    nodename);
		xfree(nodename);
	}
	return rc;
}
示例#3
0
/*
 * Receive the first message identifying initiator
 */
static void
_direct_conn_establish(pmixp_conn_t *conn, void *_hdr, void *msg)
{
	pmixp_io_engine_t *eng = pmixp_conn_get_eng(conn);
	pmixp_base_hdr_t *hdr = (pmixp_base_hdr_t *)_hdr;
	pmixp_dconn_t *dconn = NULL;
	pmixp_conn_t *new_conn;
	eio_obj_t *obj;
	int fd;

	fd = pmixp_io_detach(eng);

	dconn = pmixp_dconn_accept(hdr->nodeid, fd);
	if (!dconn) {
		/* connection was refused because we already
		 * have established connection
		 * It seems that some sort of race condition occured
		 */
		char *nodename = pmixp_info_job_host(hdr->nodeid);
		close(fd);
		PMIXP_ERROR("Failed to accept direct connection from %s",
			    nodename);
		xfree(nodename);
		return;
	}
	new_conn = pmixp_conn_new_persist(PMIXP_PROTO_DIRECT,
					  pmixp_dconn_engine(dconn),
					  _direct_new_msg_conn,
					  _direct_return_connection, dconn);
	pmixp_dconn_unlock(dconn);
	obj = eio_obj_create(fd, &direct_peer_ops, (void *)new_conn);
	eio_new_obj(pmixp_info_io(), obj);
	/* wakeup this connection to get processed */
	eio_signal_wakeup(pmixp_info_io());
}
示例#4
0
static int _slurm_send(pmixp_ep_t *ep, pmixp_base_hdr_t bhdr, Buf buf)
{
	const char *addr = NULL, *data = NULL, *hostlist = NULL;
	char nhdr[PMIXP_BASE_HDR_MAX];
	size_t hsize = 0, dsize = 0;
	int rc;

	/* setup the header */
	addr = pmixp_info_srv_usock_path();

	bhdr.ext_flag = 0;
	if (pmixp_info_srv_direct_conn() && PMIXP_EP_NOIDEID == ep->type) {
		bhdr.ext_flag = 1;
	}

	hsize = _slurm_pack_hdr(&bhdr, nhdr);
	data = _buf_finalize(buf, nhdr, hsize, &dsize);

	switch( ep->type ){
	case PMIXP_EP_HLIST:
		hostlist = ep->ep.hostlist;
		rc = pmixp_stepd_send(ep->ep.hostlist, addr,
				      data, dsize, 500, 7, 0);
		break;
	case PMIXP_EP_NOIDEID: {
		char *nodename = pmixp_info_job_host(ep->ep.nodeid);
		rc = pmixp_p2p_send(nodename, addr, data, dsize,
				    500, 7, 0);
		xfree(nodename);
		break;
	}
	default:
		PMIXP_ERROR("Bad value of the EP type: %d", (int)ep->type);
		abort();
	}

	if (SLURM_SUCCESS != rc) {
		PMIXP_ERROR("Cannot send message to %s, size = %u, "
			    "hostlist:\n%s",
			    addr, (uint32_t) dsize, hostlist);
	}
	return rc;
}
示例#5
0
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is unexepable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain incapsulation - in general `buf`is
		 * not a pointer, but opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the pingpong mode was activated -
		 * node 0 sends ping requests
		 * and receiver assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
示例#6
0
static int _progress_ufwd(pmixp_coll_t *coll)
{
	pmixp_ep_t ep[coll->chldrn_cnt];
	int ep_cnt = 0;
	int rc, i;
	char *nodename = NULL;
	pmixp_coll_cbdata_t *cbdata = NULL;

	xassert(PMIXP_COLL_UPFWD == coll->state);

	/* for some reasons doesnt switch to downfwd */

	switch (coll->ufwd_status) {
	case PMIXP_COLL_SND_FAILED:
		/* something went wrong with upward send.
		 * notify libpmix about that and abort
		 * collective */
		if (coll->cbfunc) {
			coll->cbfunc(PMIX_ERROR, NULL, 0, coll->cbdata,
				     NULL, NULL);
		}
		_reset_coll(coll);
		/* Don't need to do anything else */
		return false;
	case PMIXP_COLL_SND_ACTIVE:
		/* still waiting for the send completion */
		return false;
	case PMIXP_COLL_SND_DONE:
		if (coll->contrib_prnt) {
			/* all-set to go to the next stage */
			break;
		}
		return false;
	default:
		/* Should not happen, fatal error */
		abort();
	}

	/* We now can upward part for the next collective */
	_reset_coll_ufwd(coll);

	/* move to the next state */
	coll->state = PMIXP_COLL_DOWNFWD;
	coll->dfwd_status = PMIXP_COLL_SND_ACTIVE;
	if (!pmixp_info_srv_direct_conn()) {
		/* only root of the tree should get here */
		xassert(0 > coll->prnt_peerid);
		if (coll->chldrn_cnt) {
			/* We can run on just one node */
			ep[ep_cnt].type = PMIXP_EP_HLIST;
			ep[ep_cnt].ep.hostlist = coll->chldrn_str;
			ep_cnt++;
		}
	} else {
		for(i=0; i<coll->chldrn_cnt; i++){
			ep[i].type = PMIXP_EP_NOIDEID;
			ep[i].ep.nodeid = coll->chldrn_ids[i];
			ep_cnt++;
		}
	}

	/* We need to wait for ep_cnt send completions + the local callback */
	coll->dfwd_cb_wait = ep_cnt;

	if (ep_cnt || coll->cbfunc) {
		/* allocate the callback data */
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = ep_cnt;
		if (coll->cbfunc) {
			cbdata->refcntr++;
		}
	}

	for(i=0; i < ep_cnt; i++){
		rc = pmixp_server_send_nb(&ep[i], PMIXP_MSG_FAN_OUT, coll->seq,
					  coll->dfwd_buf,
					  _dfwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			if (PMIXP_EP_NOIDEID == ep[i].type){
				nodename = pmixp_info_job_host(ep[i].ep.nodeid);
				PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s:%d",
				    (uint64_t) get_buf_offset(coll->dfwd_buf),
				    nodename, ep[i].ep.nodeid);
				xfree(nodename);
			} else {
				PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s",
				    (uint64_t) get_buf_offset(coll->dfwd_buf),
				    ep[i].ep.hostlist);
			}
			coll->dfwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		if (PMIXP_EP_NOIDEID == ep[i].type) {
			nodename = pmixp_info_job_host(ep[i].ep.nodeid);
			PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
				    coll, nodename, ep[i].ep.nodeid,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
			xfree(nodename);
		} else {
			PMIXP_DEBUG("%p: fwd to %s, size = %lu",
				    coll, ep[i].ep.hostlist,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
		}
#endif
	}

	if (coll->cbfunc) {
		char *data = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		size_t size = get_buf_offset(coll->dfwd_buf) -
				coll->dfwd_offset;
		coll->dfwd_cb_wait++;
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     _libpmix_cb, (void *)cbdata);
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: local delivery, size = %lu",
			    coll, (uint64_t)size);
#endif
	}

	/* events observed - need another iteration */
	return true;
}
示例#7
0
int pmixp_coll_contrib_parent(pmixp_coll_t *coll, uint32_t peerid,
			     uint32_t seq, Buf buf)
{
#ifdef PMIXP_COLL_DEBUG
	char *nodename = NULL;
	int lpeerid = -1;
#endif
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int expected_peerid;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	if (pmixp_info_srv_direct_conn()) {
		expected_peerid = coll->prnt_peerid;
	} else {
		expected_peerid = coll->root_peerid;
	}

	/* Sanity check */
	pmixp_coll_sanity_check(coll);
	if (expected_peerid != peerid) {
		char *nodename = pmixp_info_job_host(peerid);
		/* protect ourselfs if we are running with no asserts */
		PMIXP_ERROR("%p: parent contrib from bad nodeid=%s:%u, "
			    "expect=%d",
			    coll, nodename, peerid, expected_peerid);
		xfree(nodename);
		goto proceed;
	}

#ifdef PMIXP_COLL_DEBUG
	nodename = pmixp_info_job_host(peerid);
	lpeerid = hostlist_find(coll->peers_hl, nodename);
	/* Mark this event */
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d): state=%s, size=%u",
		    coll, nodename, peerid, lpeerid,
		    pmixp_coll_state2str(coll->state), remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
	case PMIXP_COLL_COLLECT:
		/* It looks like a retransmission attempt when remote side
		 * identified transmission failure, but we actually successfuly
		 * received the message */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: prev contrib from %s:%d(%d): "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if ((coll->seq - 1) != seq) {
			/* FATAL: should not happen in normal workflow */
			char *nodename = pmixp_info_job_host(peerid);
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "contrib_seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq - 1) == seq);
			abort();
		}
		goto proceed;
	case PMIXP_COLL_UPFWD_WSC:{
		/* we are not actually ready to receive this contribution as
		 * the upward portion of the collective wasn't received yet.
		 * This should not happen as SAPI (SLURM API) is blocking and
		 * we chould transit to PMIXP_COLL_UPFWD_WPC immediately */
		/* FATAL: should not happen in normal workflow */
		char *nodename = pmixp_info_job_host(peerid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
			    "contrib_seq = %d, coll->seq = %d, "
			    "state=%s",
			    coll, nodename, peerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
		xassert((coll->seq - 1) == seq);
		abort();
	}
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WPC:
		/* we were waiting for this */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* It looks like a retransmission attempt when remote side
		 * identified transmission failure, but we actually successfuly
		 * received the message */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: double contrib from %s:%d(%d) "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "seq = %d, coll->seq = %d, state=%s",
				    coll, nodename, peerid,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert((coll->seq - 1) == seq);
			xfree(nodename);
			abort();
		}
		goto proceed;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	if (coll->contrib_prnt) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If grater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out */
		PMIXP_DEBUG("%p: multiple contributions from parent %s:%d",
			    coll, nodename, peerid);
		xfree(nodename);
		/* this is duplication, skip. */
		goto proceed;
	}
	coll->contrib_prnt = true;

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->dfwd_buf, size);

	data_dst = get_buf_data(coll->dfwd_buf) +
			get_buf_offset(coll->dfwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->dfwd_buf,
		       get_buf_offset(coll->dfwd_buf) + size);
proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	if (nodename) {
		PMIXP_DEBUG("%p: finish: node=%s:%d(%d), state=%s",
			    coll, nodename, peerid, lpeerid,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
	}
#endif
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
示例#8
0
int pmixp_coll_contrib_child(pmixp_coll_t *coll, uint32_t peerid,
			     uint32_t seq, Buf buf)
{
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int chld_id;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);
	pmixp_coll_sanity_check(coll);
	if (0 > (chld_id = _chld_id(coll, peerid))) {
		char *nodename = pmixp_info_job_host(peerid);
		char *avail_ids = _chld_ids_str(coll);
		PMIXP_DEBUG("%p: contribution from the non-child node "
			    "%s:%d, acceptable ids: %s",
			    coll, nodename, peerid, avail_ids);
		xfree(nodename);
		xfree(avail_ids);
	}

#ifdef PMIXP_COLL_DEBUG
	char *nodename = pmixp_info_job_host(peerid);
	int lpeerid = hostlist_find(coll->peers_hl, nodename);
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d:%d):, state=%s, size=%u",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state),
		    remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d "
				    "(child #%d) seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert(coll->seq == seq);
			abort();
		}
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
		/* FATAL: should not happen in normal workflow */
		PMIXP_ERROR("%p: unexpected contrib from %s:%d, state = %s",
			    coll, nodename, peerid,
			    pmixp_coll_state2str(coll->state));
		xassert(0);
		abort();
	case PMIXP_COLL_UPFWD_WPC:
	case PMIXP_COLL_DOWNFWD:
#ifdef PMIXP_COLL_DEBUG
		/* It looks like a retransmission attempt when remote side
		 * identified transmission failure, but we actually successfuly
		 * received the message */
		PMIXP_DEBUG("%p: contrib for the next collective "
			    "from=%s:%d(%d:%d) contrib_seq=%u, coll->seq=%u, "
			    "state=%s",
			    coll, nodename, peerid, lpeerid, chld_id,
			    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
		if ((coll->seq +1) != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d(x:%d) "
				    "seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq +1) == seq);
			abort();
		}
		break;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	if (coll->contrib_chld[chld_id]) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If grater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out */
		PMIXP_DEBUG("%p: multiple contribs from %s:%d(x:%d)",
			    coll, nodename, peerid, chld_id);
		/* this is duplication, skip. */
		xfree(nodename);
		goto proceed;
	}

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	data_dst = get_buf_data(coll->ufwd_buf) +
			get_buf_offset(coll->ufwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* increase number of individual contributions */
	coll->contrib_chld[chld_id] = true;
	/* increase number of total contributions */
	coll->contrib_children++;

proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish: node=%s:%d(%d:%d), state=%s",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state));
	xfree(nodename);
#endif
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s", nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is unexepable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
					    SIGKILL);

			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}
	xfree(nodename);
}
示例#10
0
void pmixp_coll_ring_log(pmixp_coll_t *coll)
{
	int i;
	pmixp_coll_ring_t *ring = &coll->state.ring;
	char *nodename, *next, *prev;
	char *out_str = NULL;

	PMIXP_ERROR("%p: %s state seq=%d",
		    coll, pmixp_coll_type2str(coll->type), coll->seq);
	nodename = pmixp_info_job_host(coll->my_peerid);
	PMIXP_ERROR("my peerid: %d:%s", coll->my_peerid, nodename);
	xfree(nodename);

	next = pmixp_info_job_host(_ring_next_id(coll));
	prev = pmixp_info_job_host(_ring_prev_id(coll));
	xstrfmtcat(out_str,"neighbor id: next %d:%s, prev %d:%s",
		   _ring_next_id(coll), next, _ring_prev_id(coll), prev);
	PMIXP_ERROR("%s", out_str);
	xfree(next);
	xfree(prev);
	xfree(out_str);


	for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) {
		pmixp_coll_ring_ctx_t *coll_ctx = &ring->ctx_array[i];

		PMIXP_ERROR("Context ptr=%p, #%d, in-use=%d",
			    coll_ctx, i, coll_ctx->in_use);

		if (coll_ctx->in_use) {
			int id;
			char *done_contrib, *wait_contrib;
			hostlist_t hl_done_contrib, hl_wait_contrib;

			pmixp_hostset_from_ranges(coll->pset.procs,
						  coll->pset.nprocs,
						  &hl_done_contrib);
			hl_wait_contrib = hostlist_copy(hl_done_contrib);

			PMIXP_ERROR("\t seq=%d contribs: loc=%d/prev=%d/fwd=%d",
				    coll_ctx->seq, coll_ctx->contrib_local,
				    coll_ctx->contrib_prev,
				    coll_ctx->forward_cnt);
			PMIXP_ERROR("\t neighbor contribs [%d]:",
				    coll->peers_cnt);

			for (id = 0; id < coll->peers_cnt; id++) {
				char *nodename = pmixp_info_job_host(id);

				if(coll_ctx->contrib_map[id]) {
					hostlist_delete_host(hl_wait_contrib,
							     nodename);
				} else {
					hostlist_delete_host(hl_done_contrib,
							     nodename);
				}
				xfree(nodename);
			}
			done_contrib = slurm_hostlist_ranged_string_xmalloc(
				hl_done_contrib);
			wait_contrib = slurm_hostlist_ranged_string_xmalloc(
				hl_wait_contrib);
			PMIXP_ERROR("\t done contrib: %s",
				    strlen(done_contrib) ? done_contrib : "-");
			PMIXP_ERROR("\t wait contrib: %s",
				    strlen(wait_contrib) ? wait_contrib : "-");
			PMIXP_ERROR("\t status=%s",
				    pmixp_coll_ring_state2str(coll_ctx->state));
			PMIXP_ERROR("\t buf size=%u, remain=%u",
				    size_buf(coll_ctx->ring_buf),
				    remaining_buf(coll_ctx->ring_buf));
			xfree(done_contrib);
			xfree(wait_contrib);
			hostlist_destroy(hl_done_contrib);
			hostlist_destroy(hl_wait_contrib);
		}
	}
}