Beispiel #1
0
/*
 * Send one ping-pong test message carrying `size` payload bytes to `nodeid`.
 *
 * A new server buffer is grown by `size` bytes and its offset is advanced by
 * the same amount; no payload bytes are written by this function. The buffer
 * and the requested size are stored in a heap-allocated pp_cbdata that is
 * passed to pingpong_complete() through pmixp_server_send_nb().
 *
 * NOTE(review): on failure neither `buf` nor `cbdata` is released here;
 * presumably the completion callback owns them - confirm against
 * pmixp_server_send_nb() before adding any cleanup.
 *
 * IN nodeid - id of the destination node
 * IN size   - number of payload bytes to send
 * RET status returned by pmixp_server_send_nb()
 */
int pmixp_server_pp_send(int nodeid, int size)
{
	Buf buf = pmixp_server_buf_new();
	int rc;
	pmixp_ep_t ep;
	struct pp_cbdata *cbdata = xmalloc(sizeof(*cbdata));

	/* make room for the payload and account for it in the offset */
	grow_buf(buf, size);
	ep.type = PMIXP_EP_NOIDEID;
	ep.ep.nodeid = nodeid;
	cbdata->buf = buf;
	cbdata->size = size;
	set_buf_offset(buf, get_buf_offset(buf) + size);

	rc = pmixp_server_send_nb(&ep, PMIXP_MSG_PINGPONG,
				  _pmixp_pp_count, buf, pingpong_complete,
				  (void*)cbdata);
	if (SLURM_SUCCESS != rc) {
		char *nodename = pmixp_info_job_host(nodeid);
		/* report what actually failed: the ping-pong send */
		PMIXP_ERROR("Cannot send ping-pong message (size = %d) "
			    "to %s:%d",
			    size, nodename, nodeid);
		xfree(nodename);
	}
	return rc;
}
Beispiel #2
0
/*
 * Forward a contribution to the next peer of the ring collective.
 *
 * A ring header (own peer id, payload size, context sequence number, hop
 * sequence and contributor id) is packed into a forwarding buffer obtained
 * from _get_fwd_buf(), followed by a copy of `data`. The buffer is then sent
 * to ring->next_peerid with pmixp_server_send_nb(); _ring_sent_cb receives a
 * heap-allocated cbdata describing the buffer, collective, context and
 * sequence number.
 *
 * NOTE(review): if the send fails, `cbdata` and `buf` are not released here;
 * presumably _ring_sent_cb is responsible for them - confirm.
 *
 * IN coll_ctx   - ring context the data belongs to
 * IN contrib_id - id of the original contributor
 * IN hop_seq    - hop sequence number to put into the header
 * IN data       - payload to forward
 * IN size       - payload size in bytes
 * RET SLURM_ERROR if no forwarding buffer is available, otherwise the status
 *     returned by pmixp_server_send_nb()
 */
static int _ring_forward_data(pmixp_coll_ring_ctx_t *coll_ctx, uint32_t contrib_id,
			      uint32_t hop_seq, void *data, size_t size)
{
	pmixp_coll_ring_msg_hdr_t hdr;
	pmixp_coll_t *coll = _ctx_get_coll(coll_ctx);
	pmixp_coll_ring_t *ring = &coll->state.ring;
	/*
	 * The endpoint lives on the stack, as in the other callers of
	 * pmixp_server_send_nb(); a heap allocation here was never freed.
	 */
	pmixp_ep_t ep = {0};
	pmixp_coll_ring_cbdata_t *cbdata = NULL;
	uint32_t offset = 0;
	Buf buf = NULL;
	int rc = SLURM_SUCCESS;

	hdr.nodeid = coll->my_peerid;
	hdr.msgsize = size;
	hdr.seq = coll_ctx->seq;
	hdr.hop_seq = hop_seq;
	hdr.contrib_id = contrib_id;

	buf = _get_fwd_buf(coll_ctx);

	pmixp_coll_ring_ctx_sanity_check(coll_ctx);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: transit data to nodeid=%d, seq=%d, hop=%d, size=%lu, contrib=%d",
		    coll_ctx, _ring_next_id(coll), hdr.seq,
		    hdr.hop_seq, hdr.msgsize, hdr.contrib_id);
#endif
	if (!buf) {
		rc = SLURM_ERROR;
		goto exit;
	}
	ep.type = PMIXP_EP_NOIDEID;
	ep.ep.nodeid = ring->next_peerid;

	/* pack ring info */
	_pack_coll_ring_info(coll, &hdr, buf);

	/* append the payload right after the packed header */
	offset = get_buf_offset(buf);
	pmixp_server_buf_reserve(buf, size);
	memcpy(get_buf_data(buf) + offset, data, size);
	set_buf_offset(buf, offset + size);

	/* context handed to the send-completion callback */
	cbdata = xmalloc(sizeof(pmixp_coll_ring_cbdata_t));
	cbdata->buf = buf;
	cbdata->coll = coll;
	cbdata->coll_ctx = coll_ctx;
	cbdata->seq = coll_ctx->seq;
	rc = pmixp_server_send_nb(&ep, PMIXP_MSG_RING, coll_ctx->seq, buf,
				  _ring_sent_cb, cbdata);
exit:
	return rc;
}
Beispiel #3
0
/*
 * Progress a collective that is in the PMIXP_COLL_UPFWD state.
 *
 * Depending on coll->ufwd_status:
 *  - SND_FAILED: the local callback (if any) is invoked with PMIX_ERROR,
 *    the collective is reset and false is returned;
 *  - SND_ACTIVE: false is returned, the upward send is still in flight;
 *  - SND_DONE:   processing continues only if coll->contrib_prnt is set,
 *    otherwise false is returned;
 *  - any other value aborts the process.
 *
 * When processing continues, the upward part is reset, the collective moves
 * to PMIXP_COLL_DOWNFWD and coll->dfwd_buf is sent to the children: as one
 * hostlist endpoint when direct connections are not in use, or as one
 * node-id endpoint per child otherwise. If coll->cbfunc is set, the data
 * following coll->dfwd_offset is also delivered locally through it.
 * A shared cbdata is reference-counted: one reference per send plus one for
 * the local delivery.
 *
 * RET true if the state machine advanced (another iteration is needed),
 *     false otherwise
 */
static int _progress_ufwd(pmixp_coll_t *coll)
{
	/*
	 * At most one endpoint per child is needed. A variable-length array
	 * must have a size greater than zero, so keep one slot even when the
	 * collective has no children.
	 */
	pmixp_ep_t ep[coll->chldrn_cnt > 0 ? coll->chldrn_cnt : 1];
	int ep_cnt = 0;
	int rc, i;
	char *nodename = NULL;
	pmixp_coll_cbdata_t *cbdata = NULL;

	xassert(PMIXP_COLL_UPFWD == coll->state);

	/* decide whether we can switch to the downward forwarding stage */

	switch (coll->ufwd_status) {
	case PMIXP_COLL_SND_FAILED:
		/* something went wrong with upward send.
		 * notify libpmix about that and abort
		 * collective */
		if (coll->cbfunc) {
			coll->cbfunc(PMIX_ERROR, NULL, 0, coll->cbdata,
				     NULL, NULL);
		}
		_reset_coll(coll);
		/* Don't need to do anything else */
		return false;
	case PMIXP_COLL_SND_ACTIVE:
		/* still waiting for the send completion */
		return false;
	case PMIXP_COLL_SND_DONE:
		if (coll->contrib_prnt) {
			/* all-set to go to the next stage */
			break;
		}
		return false;
	default:
		/* Should not happen, fatal error */
		abort();
	}

	/* The upward part can now be reused for the next collective */
	_reset_coll_ufwd(coll);

	/* move to the next state */
	coll->state = PMIXP_COLL_DOWNFWD;
	coll->dfwd_status = PMIXP_COLL_SND_ACTIVE;
	if (!pmixp_info_srv_direct_conn()) {
		/* only root of the tree should get here */
		xassert(0 > coll->prnt_peerid);
		if (coll->chldrn_cnt) {
			/* We can run on just one node */
			ep[ep_cnt].type = PMIXP_EP_HLIST;
			ep[ep_cnt].ep.hostlist = coll->chldrn_str;
			ep_cnt++;
		}
	} else {
		for(i=0; i<coll->chldrn_cnt; i++){
			ep[i].type = PMIXP_EP_NOIDEID;
			ep[i].ep.nodeid = coll->chldrn_ids[i];
			ep_cnt++;
		}
	}

	/* We need to wait for ep_cnt send completions + the local callback */
	coll->dfwd_cb_wait = ep_cnt;

	if (ep_cnt || coll->cbfunc) {
		/* allocate the callback data shared by all completions */
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = ep_cnt;
		if (coll->cbfunc) {
			/* one more reference for the local delivery */
			cbdata->refcntr++;
		}
	}

	for(i=0; i < ep_cnt; i++){
		rc = pmixp_server_send_nb(&ep[i], PMIXP_MSG_FAN_OUT, coll->seq,
					  coll->dfwd_buf,
					  _dfwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			if (PMIXP_EP_NOIDEID == ep[i].type){
				nodename = pmixp_info_job_host(ep[i].ep.nodeid);
				PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s:%d",
				    (uint64_t) get_buf_offset(coll->dfwd_buf),
				    nodename, ep[i].ep.nodeid);
				xfree(nodename);
			} else {
				PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s",
				    (uint64_t) get_buf_offset(coll->dfwd_buf),
				    ep[i].ep.hostlist);
			}
			/* remember the failure; remaining sends still run */
			coll->dfwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		if (PMIXP_EP_NOIDEID == ep[i].type) {
			nodename = pmixp_info_job_host(ep[i].ep.nodeid);
			PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
				    coll, nodename, ep[i].ep.nodeid,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
			xfree(nodename);
		} else {
			PMIXP_DEBUG("%p: fwd to %s, size = %lu",
				    coll, ep[i].ep.hostlist,
				    (uint64_t) get_buf_offset(coll->dfwd_buf));
		}
#endif
	}

	if (coll->cbfunc) {
		/* deliver the payload that follows dfwd_offset locally */
		char *data = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		size_t size = get_buf_offset(coll->dfwd_buf) -
				coll->dfwd_offset;
		coll->dfwd_cb_wait++;
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
			     _libpmix_cb, (void *)cbdata);
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: local delivery, size = %lu",
			    coll, (uint64_t)size);
#endif
	}

	/* events observed - need another iteration */
	return true;
}
Beispiel #4
0
/*
 * Progress a collective that is in the PMIXP_COLL_COLLECT state.
 *
 * Nothing happens until the local contribution is present and the number of
 * child contributions equals coll->chldrn_cnt. Once both hold:
 *  - the state becomes PMIXP_COLL_UPFWD, or PMIXP_COLL_UPFWD_WSC for a node
 *    that has a parent (prnt_peerid >= 0) when direct connections are not
 *    in use;
 *  - if coll->prnt_host is set, coll->ufwd_buf is sent to the parent with
 *    pmixp_server_send_nb() and ufwd_status becomes SND_ACTIVE (or
 *    SND_FAILED if the send call fails);
 *  - otherwise (root) the payload after ufwd_offset is copied from ufwd_buf
 *    into dfwd_buf at dfwd_offset, ufwd_status becomes SND_DONE and
 *    contrib_prnt is set.
 *
 * RET 0 if the collective is not ready to advance, true if it advanced
 *     (another iteration is needed)
 */
static int _progress_collect(pmixp_coll_t *coll)
{
	pmixp_ep_t ep = {0};
	int rc;

	xassert(PMIXP_COLL_COLLECT == coll->state);

	ep.type = PMIXP_EP_NONE;
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: state=%s, local=%d, child_cntr=%d",
		    coll, pmixp_coll_state2str(coll->state),
		    (int)coll->contrib_local, coll->contrib_children);
#endif
	/* verify the collective structure before touching it */
	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_COLLECT != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads we can be called
		 * after we moved to the next step. */
		return 0;
	}

	if (!coll->contrib_local ||
	    coll->contrib_children != coll->chldrn_cnt) {
		/* Not yet ready to go to the next step */
		return 0;
	}

	if (pmixp_info_srv_direct_conn()) {
		/* We will need to forward aggregated
		 * message back to our children */
		coll->state = PMIXP_COLL_UPFWD;
	} else {
		/* If we use SLURM API (SAPI) - intermediate nodes
		 * don't need to forward data as the root will do
		 * SAPI broadcast.
		 * So, only root has to go through the full UPFWD
		 * state and send the message back.
		 * Other procs have to go through other route. The reason for
		 * that is the fact that some of our children can receive the
		 * bcast message early and initiate the next collective. We
		 * need to handle that properly.
		 */
		if (0 > coll->prnt_peerid) {
			coll->state = PMIXP_COLL_UPFWD;
		} else {
			coll->state = PMIXP_COLL_UPFWD_WSC;
		}
	}

	/* The root of the collective will have prnt_host == NULL */
	if (NULL != coll->prnt_host) {
		ep.type = PMIXP_EP_NOIDEID;
		ep.ep.nodeid = coll->prnt_peerid;
		coll->ufwd_status = PMIXP_COLL_SND_ACTIVE;
		PMIXP_DEBUG("%p: send data to %s:%d",
			    coll, coll->prnt_host, coll->prnt_peerid);
	} else {
		/* move data from input buffer to the output */
		char *dst, *src = get_buf_data(coll->ufwd_buf) +
				coll->ufwd_offset;
		size_t size = get_buf_offset(coll->ufwd_buf) -
				coll->ufwd_offset;
		pmixp_server_buf_reserve(coll->dfwd_buf, size);
		dst = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		memcpy(dst, src, size);
		set_buf_offset(coll->dfwd_buf, coll->dfwd_offset + size);
		/* no need to send */
		coll->ufwd_status = PMIXP_COLL_SND_DONE;
		/* this is root */
		coll->contrib_prnt = true;
	}

	if (PMIXP_EP_NONE != ep.type) {
		/* an endpoint was selected above: send upward to the parent */
		pmixp_coll_cbdata_t *cbdata;
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = 1;
		char *nodename = coll->prnt_host;
		rc = pmixp_server_send_nb(&ep, PMIXP_MSG_FAN_IN, coll->seq,
					  coll->ufwd_buf,
					  _ufwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s:%d",
				    (uint64_t) get_buf_offset(coll->ufwd_buf),
				    nodename, ep.ep.nodeid);
			coll->ufwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		/* report the size of the buffer that was actually sent */
		PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
			    coll, nodename, ep.ep.nodeid,
			    (uint64_t) get_buf_offset(coll->ufwd_buf));
#endif
	}

	/* events observed - need another iteration */
	return true;
}