Example 1
static inline void pmixp_coll_ring_ctx_sanity_check(
	pmixp_coll_ring_ctx_t *coll_ctx)
{
	xassert(NULL != coll_ctx);
	xassert(coll_ctx->in_use);
	pmixp_coll_sanity_check(coll_ctx->coll);
}
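
A quick standalone model of this sanity-check pattern, using the standard assert in place of SLURM's xassert; the types and the magic field are hypothetical stand-ins for the pmixp structures:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* hypothetical, simplified stand-ins for the pmixp types */
typedef struct { int magic; } coll_t;
typedef struct { bool in_use; coll_t *coll; } ring_ctx_t;

#define COLL_MAGIC 0xC011EC7

static inline void ring_ctx_sanity_check(ring_ctx_t *ctx)
{
	assert(NULL != ctx);		/* context pointer must be valid */
	assert(ctx->in_use);		/* only active contexts are checked */
	assert(NULL != ctx->coll);	/* parent collective must be attached */
	assert(COLL_MAGIC == ctx->coll->magic);	/* detect use-after-free */
}
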
Example 2
int pmixp_coll_ring_local(pmixp_coll_t *coll, char *data, size_t size,
			  void *cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;
	pmixp_coll_ring_ctx_t *coll_ctx = NULL;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	coll_ctx = pmixp_coll_ring_ctx_new(coll);
	if (!coll_ctx) {
		PMIXP_ERROR("Can not get new ring collective context, seq=%u",
			    coll->seq);
		ret = SLURM_ERROR;
		goto exit;
	}

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%d, size=%lu",
		    coll_ctx, coll_ctx->seq, coll_ctx->state, size);
#endif

	if (_pmixp_coll_contrib(coll_ctx, coll->my_peerid, 0, data, size)) {
		ret = SLURM_ERROR;
		goto exit;
	}

	/* mark local contribution */
	coll_ctx->contrib_local = true;
	_progress_coll_ring(coll_ctx);

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return ret;
}
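
A minimal self-contained sketch of the lock / validate / single-exit flow used above, with a pthread mutex standing in for slurm_mutex and hypothetical names throughout:

#include <pthread.h>
#include <string.h>

#define RC_SUCCESS 0
#define RC_ERROR  (-1)

typedef struct {
	pthread_mutex_t lock;
	char data[4096];
	size_t offset;
	int contrib_local;
} coll_t;

/* deposit a local contribution under the collective's lock,
 * using the same goto-exit error style as the example above */
static int coll_contrib_local(coll_t *coll, const char *data, size_t size)
{
	int ret = RC_SUCCESS;

	pthread_mutex_lock(&coll->lock);

	if (coll->contrib_local || size > sizeof(coll->data) - coll->offset) {
		ret = RC_ERROR;	/* duplicate contribution or overflow */
		goto exit;
	}

	memcpy(coll->data + coll->offset, data, size);
	coll->offset += size;
	coll->contrib_local = 1;

exit:
	pthread_mutex_unlock(&coll->lock);
	return ret;
}
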
Example 3
static void _libpmix_cb(void *_vcbdata)
{
	pmixp_coll_ring_cbdata_t *cbdata = (pmixp_coll_ring_cbdata_t*)_vcbdata;
	pmixp_coll_t *coll = cbdata->coll;
	Buf buf = cbdata->buf;

	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* reset buf */
	buf->processed = 0;
	/* push it back to pool for reuse */
	list_push(coll->state.ring.ring_buf_pool, buf);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	xfree(cbdata);
}
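
The callback above resets a finished buffer and pushes it back to a pool rather than freeing it. Below is a self-contained sketch of that reuse pattern with a mutex-protected free list; the buf_t layout is an assumption, not SLURM's Buf:

#include <pthread.h>
#include <stdlib.h>

typedef struct buf {
	struct buf *next;
	size_t processed;	/* read cursor, reset before reuse */
	char data[1024];
} buf_t;

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static buf_t *pool_head;

/* push a used buffer back for reuse, mirroring list_push() above */
static void buf_release(buf_t *buf)
{
	buf->processed = 0;	/* "reset buf" */
	pthread_mutex_lock(&pool_lock);
	buf->next = pool_head;
	pool_head = buf;
	pthread_mutex_unlock(&pool_lock);
}

/* pop a buffer from the pool, falling back to calloc when empty */
static buf_t *buf_acquire(void)
{
	buf_t *buf;

	pthread_mutex_lock(&pool_lock);
	buf = pool_head;
	if (buf)
		pool_head = buf->next;
	pthread_mutex_unlock(&pool_lock);
	return buf ? buf : calloc(1, sizeof(*buf));
}
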
Example 4
void pmixp_coll_free(pmixp_coll_t *coll)
{
	pmixp_coll_sanity_check(coll);

	if (NULL != coll->pset.procs) {
		xfree(coll->pset.procs);
	}
#ifdef PMIXP_COLL_DEBUG
	hostlist_destroy(coll->peers_hl);
#endif
	/* check for collective in a not-SYNC state - something went wrong */
	switch (coll->type) {
	case PMIXP_COLL_TYPE_FENCE_TREE:
		if (PMIXP_COLL_TREE_SYNC != coll->state.tree.state)
			pmixp_coll_log(coll);

		pmixp_coll_tree_free(&coll->state.tree);
		break;
	case PMIXP_COLL_TYPE_FENCE_RING:
	{
		int i, ctx_in_use = 0;
		for (i = 0; i < PMIXP_COLL_RING_CTX_NUM; i++) {
			pmixp_coll_ring_ctx_t *coll_ctx =
				&coll->state.ring.ctx_array[i];
			if (coll_ctx->in_use)
				ctx_in_use++;
		}
		if (ctx_in_use)
			pmixp_coll_log(coll);
		pmixp_coll_ring_free(&coll->state.ring);
		break;
	}
	default:
		PMIXP_ERROR("Unknown coll type");
		break;
	}
	xfree(coll);
}
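
Before freeing, the ring branch above audits whether any context is still in use and logs the collective if so. A compact model of that audit (RING_CTX_NUM and the context type are illustrative stand-ins for SLURM's PMIXP_COLL_RING_CTX_NUM and ring context):

#include <stdbool.h>
#include <stdio.h>

#define RING_CTX_NUM 3	/* illustrative; see PMIXP_COLL_RING_CTX_NUM */

typedef struct { bool in_use; } ring_ctx_t;

/* count contexts that are still active; a non-zero result at free
 * time means something went wrong, so the caller should log it */
static int ring_ctx_in_use(const ring_ctx_t ctx[RING_CTX_NUM])
{
	int i, busy = 0;

	for (i = 0; i < RING_CTX_NUM; i++) {
		if (ctx[i].in_use)
			busy++;
	}
	if (busy)
		fprintf(stderr, "freeing collective, %d ctx busy\n", busy);
	return busy;
}
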
Example 5
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution", pmixp_info_namespace(),
			pmixp_info_nodeid());

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* change the collective state if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG(
				"%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
				pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* save & mark local contribution */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* check if the collective is ready to progress */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
			pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
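
The save step grows the target buffer, copies the payload, and advances the write offset. A standalone sketch of that grow-and-append idiom backed by realloc (gbuf_t is a hypothetical simplification of SLURM's Buf):

#include <stdlib.h>
#include <string.h>

typedef struct {
	char *data;
	size_t size;	/* allocated bytes */
	size_t offset;	/* bytes already used */
} gbuf_t;

/* ensure room for `need` more bytes, then append - the same shape
 * as grow_buf() + memcpy() + set_buf_offset() above */
static int gbuf_append(gbuf_t *b, const void *src, size_t need)
{
	if (b->size - b->offset < need) {
		size_t nsize = b->offset + need;
		char *ndata = realloc(b->data, nsize);

		if (!ndata)
			return -1;
		b->data = ndata;
		b->size = nsize;
	}
	memcpy(b->data + b->offset, src, need);
	b->offset += need;
	return 0;
}
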
Example 6
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	xassert(PMIXP_COLL_FAN_OUT == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* update the database */
	if (NULL != coll->cbfunc) {
		void *data = get_buf_data(buf) + get_buf_offset(buf);
		size_t size = remaining_buf(buf);
		PMIXP_DEBUG("%s:%d: use the callback", pmixp_info_namespace(),
				pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
				pmixp_free_Buf, (void *)buf);
	}
	/* Prepare for the next collective operation */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
			pmixp_info_namespace(), pmixp_info_nodeid());
}
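
In the fan-out step the payload is handed to the callback together with pmixp_free_Buf, so the consumer releases the buffer only when it is done with the data. A minimal model of that ownership handoff via a release hook (all names hypothetical):

#include <stdlib.h>

typedef void (*release_fn_t)(void *ctx);

/* consumer side: use the data, then release it via the supplied hook */
static void deliver(const void *data, size_t size,
		    release_fn_t release, void *release_ctx)
{
	(void)data; (void)size;		/* ... consume the payload ... */
	if (release)
		release(release_ctx);	/* buffer may be freed only now */
}

static void free_buf(void *ctx)
{
	free(ctx);
}

/* producer side, mirroring coll->cbfunc(..., pmixp_free_Buf, buf) */
static void publish(void *buf, size_t size)
{
	deliver(buf, size, free_buf, buf);
}
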
Example 7
static void _ring_sent_cb(int rc, pmixp_p2p_ctx_t ctx, void *_cbdata)
{
	pmixp_coll_ring_cbdata_t *cbdata = (pmixp_coll_ring_cbdata_t*)_cbdata;
	pmixp_coll_ring_ctx_t *coll_ctx = cbdata->coll_ctx;
	pmixp_coll_t *coll = cbdata->coll;
	Buf buf = cbdata->buf;

	pmixp_coll_sanity_check(coll);

	if (PMIXP_P2P_REGULAR == ctx) {
		/* lock the collective */
		slurm_mutex_lock(&coll->lock);
	}
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: called %d", coll_ctx, coll_ctx->seq);
#endif
	if (cbdata->seq != coll_ctx->seq) {
		/* it seems like this collective was reset since the time
		 * we initiated this send.
		 * Just exit to avoid data corruption.
		 */
		PMIXP_DEBUG("%p: collective was reset!", coll_ctx);
		goto exit;
	}
	coll_ctx->forward_cnt++;
	_progress_coll_ring(coll_ctx);

exit:
	pmixp_server_buf_reset(buf);
	list_push(coll->state.ring.fwrd_buf_pool, buf);

	if (PMIXP_P2P_REGULAR == ctx) {
		/* unlock the collective */
		slurm_mutex_unlock(&coll->lock);
	}
	xfree(cbdata);
}
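
The cbdata->seq check above discards completion events that arrive after the collective was reset while the send was in flight. A condensed model of that staleness guard (types are illustrative):

#include <stdbool.h>
#include <stdint.h>

typedef struct { uint32_t seq; int forward_cnt; } ctx_t;
typedef struct { ctx_t *ctx; uint32_t seq; } send_cbdata_t;

/* completion handler: only count the send if the context still runs
 * the same collective we started the send for */
static bool send_done(send_cbdata_t *cb)
{
	if (cb->seq != cb->ctx->seq)
		return false;	/* collective was reset; drop the event */
	cb->ctx->forward_cnt++;
	return true;
}
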
Example 8
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size,
			     pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;

	pmixp_debug_hang(0);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%s, size=%zd",
		    coll, coll->seq, pmixp_coll_state2str(coll->state), size);
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* We are still waiting for some send requests to finish,
		 * but the local node has already started the next
		 * contribution. This is an OK situation; go ahead and
		 * store the data, as the contribution buffer is not in
		 * use right now.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: next coll!", coll);
#endif
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
	case PMIXP_COLL_UPFWD_WPC:
		/* this is incorrect behavior; respond with an error */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: before prev coll is finished!",
			    coll);
#endif
		ret = SLURM_ERROR;
		goto exit;
	default:
		/* FATAL: should not happen in normal workflow */
		PMIXP_ERROR("%p: local contrib while active collective, "
			    "state = %s",
			    coll, pmixp_coll_state2str(coll->state));
		xassert(0);
		abort();
	}

	if (coll->contrib_local) {
		/* Double contribution - reject */
		ret = SLURM_ERROR;
		goto exit;
	}

	/* save & mark local contribution */
	coll->contrib_local = true;
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	memcpy(get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf),
	       data, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	/* check if the collective is ready to progress */
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish, state=%s",
		    coll, pmixp_coll_state2str(coll->state));
#endif

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);
	return ret;
}
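
The switch above is a state gate: SYNC falls through into COLLECT after stamping the start time, DOWNFWD is tolerated, and the UPFWD states reject the contribution. A reduced sketch of the same gate with hypothetical state names:

#include <time.h>

typedef enum { ST_SYNC, ST_COLLECT, ST_UPFWD, ST_DOWNFWD } state_t;

typedef struct { state_t state; time_t ts; } coll_t;

/* return 0 if a local contribution is acceptable in this state */
static int contrib_allowed(coll_t *coll)
{
	switch (coll->state) {
	case ST_SYNC:
		coll->ts = time(NULL);	/* first event of a new collective */
		/* fall-thru */
	case ST_COLLECT:
	case ST_DOWNFWD:	/* previous sends still draining: OK */
		return 0;
	case ST_UPFWD:		/* previous collective not finished */
	default:
		return -1;
	}
}
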
Example 9
static int _progress_collect(pmixp_coll_t *coll)
{
	pmixp_ep_t ep = {0};
	int rc;

	xassert(PMIXP_COLL_COLLECT == coll->state);

	ep.type = PMIXP_EP_NONE;
#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: state=%s, local=%d, child_cntr=%d",
		    coll, pmixp_coll_state2str(coll->state),
		    (int)coll->contrib_local, coll->contrib_children);
#endif
	/* sanity check (the caller already holds coll->lock) */
	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_COLLECT != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads we can be called
		 * after we moved to the next step. */
		return 0;
	}

	if (!coll->contrib_local ||
	    coll->contrib_children != coll->chldrn_cnt) {
		/* Not yet ready to go to the next step */
		return 0;
	}

	if (pmixp_info_srv_direct_conn()) {
		/* We will need to forward aggregated
		 * message back to our children */
		coll->state = PMIXP_COLL_UPFWD;
	} else {
		/* If we use the SLURM API (SAPI), intermediate nodes
		 * don't need to forward data, as the root will do a
		 * SAPI broadcast.
		 * So only the root has to go through the full UPFWD
		 * state and send the message back.
		 * Other procs take a different route, because some of
		 * our children can receive the bcast message early and
		 * initiate the next collective. We need to handle that
		 * properly.
		 */
		if (0 > coll->prnt_peerid) {
			coll->state = PMIXP_COLL_UPFWD;
		} else {
			coll->state = PMIXP_COLL_UPFWD_WSC;
		}
	}

	/* The root of the collective will have prnt_host == NULL */
	if (NULL != coll->prnt_host) {
		ep.type = PMIXP_EP_NOIDEID;
		ep.ep.nodeid = coll->prnt_peerid;
		coll->ufwd_status = PMIXP_COLL_SND_ACTIVE;
		PMIXP_DEBUG("%p: send data to %s:%d",
			    coll, coll->prnt_host, coll->prnt_peerid);
	} else {
		/* move data from input buffer to the output */
		char *dst, *src = get_buf_data(coll->ufwd_buf) +
				coll->ufwd_offset;
		size_t size = get_buf_offset(coll->ufwd_buf) -
				coll->ufwd_offset;
		pmixp_server_buf_reserve(coll->dfwd_buf, size);
		dst = get_buf_data(coll->dfwd_buf) + coll->dfwd_offset;
		memcpy(dst, src, size);
		set_buf_offset(coll->dfwd_buf, coll->dfwd_offset + size);
		/* no need to send */
		coll->ufwd_status = PMIXP_COLL_SND_DONE;
		/* this is root */
		coll->contrib_prnt = true;
	}

	if (PMIXP_EP_NONE != ep.type) {
		pmixp_coll_cbdata_t *cbdata;
		cbdata = xmalloc(sizeof(pmixp_coll_cbdata_t));
		cbdata->coll = coll;
		cbdata->seq = coll->seq;
		cbdata->refcntr = 1;
		char *nodename = coll->prnt_host;
		rc = pmixp_server_send_nb(&ep, PMIXP_MSG_FAN_IN, coll->seq,
					  coll->ufwd_buf,
					  _ufwd_sent_cb, cbdata);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), "
				    "to %s:%d",
				    (uint64_t) get_buf_offset(coll->ufwd_buf),
				    nodename, ep.ep.nodeid);
			coll->ufwd_status = PMIXP_COLL_SND_FAILED;
		}
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: fwd to %s:%d, size = %lu",
			    coll, nodename, ep.ep.nodeid,
			    (uint64_t) get_buf_offset(coll->dfwd_buf));
#endif
	}

	/* events observed - need another iteration */
	return true;
}
Example 10
/*
 * Used by the internal collective
 * performance evaluation tool.
 */
pmixp_coll_t *pmixp_coll_from_cbdata(void *cbdata)
{
	pmixp_coll_cbdata_t *ptr = (pmixp_coll_cbdata_t*)cbdata;
	pmixp_coll_sanity_check(ptr->coll);
	return ptr->coll;
}
Example 11
int pmixp_coll_contrib_parent(pmixp_coll_t *coll, uint32_t peerid,
			     uint32_t seq, Buf buf)
{
#ifdef PMIXP_COLL_DEBUG
	char *nodename = NULL;
	int lpeerid = -1;
#endif
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int expected_peerid;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	if (pmixp_info_srv_direct_conn()) {
		expected_peerid = coll->prnt_peerid;
	} else {
		expected_peerid = coll->root_peerid;
	}

	/* Sanity check */
	pmixp_coll_sanity_check(coll);
	if (expected_peerid != peerid) {
		char *nodename = pmixp_info_job_host(peerid);
		/* protect ourselves if we are running with no asserts */
		PMIXP_ERROR("%p: parent contrib from bad nodeid=%s:%u, "
			    "expect=%d",
			    coll, nodename, peerid, expected_peerid);
		xfree(nodename);
		goto proceed;
	}

#ifdef PMIXP_COLL_DEBUG
	nodename = pmixp_info_job_host(peerid);
	lpeerid = hostlist_find(coll->peers_hl, nodename);
	/* Mark this event */
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d): state=%s, size=%u",
		    coll, nodename, peerid, lpeerid,
		    pmixp_coll_state2str(coll->state), remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
	case PMIXP_COLL_COLLECT:
		/* This looks like a retransmission attempt: the remote side
		 * detected a transmission failure, but we actually received
		 * the message successfully */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: prev contrib from %s:%d(%d): "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if ((coll->seq - 1) != seq) {
			/* FATAL: should not happen in normal workflow */
			char *nodename = pmixp_info_job_host(peerid);
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "contrib_seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq - 1) == seq);
			abort();
		}
		goto proceed;
	case PMIXP_COLL_UPFWD_WSC:{
		/* We are not actually ready to receive this contribution,
		 * as the upward portion of the collective hasn't completed
		 * yet. This should not happen: SAPI (SLURM API) is blocking
		 * and we should transition to PMIXP_COLL_UPFWD_WPC
		 * immediately */
		/* FATAL: should not happen in normal workflow */
		char *nodename = pmixp_info_job_host(peerid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
			    "contrib_seq = %d, coll->seq = %d, "
			    "state=%s",
			    coll, nodename, peerid,
			    seq, coll->seq,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
		xassert((coll->seq - 1) == seq);
		abort();
	}
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WPC:
		/* we were waiting for this */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* This looks like a retransmission attempt: the remote side
		 * detected a transmission failure, but we actually received
		 * the message successfully */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: double contrib from %s:%d(%d) "
			    "seq=%u, cur_seq=%u, state=%s",
			    coll, nodename, peerid, lpeerid,
			    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d: "
				    "seq = %d, coll->seq = %d, state=%s",
				    coll, nodename, peerid,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert(coll->seq == seq);
			xfree(nodename);
			abort();
		}
		goto proceed;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	if (coll->contrib_prnt) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If greater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out */
		PMIXP_DEBUG("%p: multiple contributions from parent %s:%d",
			    coll, nodename, peerid);
		xfree(nodename);
		/* this is duplication, skip. */
		goto proceed;
	}
	coll->contrib_prnt = true;

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->dfwd_buf, size);

	data_dst = get_buf_data(coll->dfwd_buf) +
			get_buf_offset(coll->dfwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->dfwd_buf,
		       get_buf_offset(coll->dfwd_buf) + size);
proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	if (nodename) {
		PMIXP_DEBUG("%p: finish: node=%s:%d(%d), state=%s",
			    coll, nodename, peerid, lpeerid,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
	}
#endif
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
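
Examples 11 and 12 classify an incoming sequence number as a late retransmission of the previous round (coll->seq - 1), the current round, or an early start of the next one (coll->seq + 1); anything else is fatal. A compact sketch of that window check, tolerating unsigned wraparound:

#include <stdint.h>

typedef enum { SEQ_PREV, SEQ_CURRENT, SEQ_NEXT, SEQ_BAD } seq_class_t;

/* classify a contribution's seq against the collective's seq;
 * unsigned arithmetic makes the +/-1 comparisons wrap-safe */
static seq_class_t classify_seq(uint32_t coll_seq, uint32_t msg_seq)
{
	if (msg_seq == coll_seq)
		return SEQ_CURRENT;	/* expected round */
	if (msg_seq == coll_seq - 1)
		return SEQ_PREV;	/* late retransmission */
	if (msg_seq == coll_seq + 1)
		return SEQ_NEXT;	/* next round started early */
	return SEQ_BAD;			/* fatal in normal workflow */
}
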
Example 12
int pmixp_coll_contrib_child(pmixp_coll_t *coll, uint32_t peerid,
			     uint32_t seq, Buf buf)
{
	char *data_src = NULL, *data_dst = NULL;
	uint32_t size;
	int chld_id;

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);
	pmixp_coll_sanity_check(coll);
	if (0 > (chld_id = _chld_id(coll, peerid))) {
		char *nodename = pmixp_info_job_host(peerid);
		char *avail_ids = _chld_ids_str(coll);
		PMIXP_DEBUG("%p: contribution from the non-child node "
			    "%s:%d, acceptable ids: %s",
			    coll, nodename, peerid, avail_ids);
		xfree(nodename);
		xfree(avail_ids);
		/* reject this contribution: otherwise the negative
		 * chld_id would be used below to index contrib_chld[] */
		slurm_mutex_unlock(&coll->lock);
		return SLURM_ERROR;
	}

#ifdef PMIXP_COLL_DEBUG
	char *nodename = pmixp_info_job_host(peerid);
	int lpeerid = hostlist_find(coll->peers_hl, nodename);
	PMIXP_DEBUG("%p: contrib/rem from %s:%d(%d:%d):, state=%s, size=%u",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state),
		    remaining_buf(buf));
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		if (coll->seq != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* FATAL: should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d "
				    "(child #%d) seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xassert(coll->seq == seq);
			abort();
		}
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC: {
		/* FATAL: should not happen in normal workflow.
		 * Fetch nodename locally: the one above is only
		 * available when PMIXP_COLL_DEBUG is defined */
		char *nodename = pmixp_info_job_host(peerid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%d, state = %s",
			    coll, nodename, peerid,
			    pmixp_coll_state2str(coll->state));
		xfree(nodename);
		xassert(0);
		abort();
	}
	case PMIXP_COLL_UPFWD_WPC:
	case PMIXP_COLL_DOWNFWD:
#ifdef PMIXP_COLL_DEBUG
		/* This looks like a retransmission attempt: the remote side
		 * detected a transmission failure, but we actually received
		 * the message successfully */
		PMIXP_DEBUG("%p: contrib for the next collective "
			    "from=%s:%d(%d:%d) contrib_seq=%u, coll->seq=%u, "
			    "state=%s",
			    coll, nodename, peerid, lpeerid, chld_id,
			    seq, coll->seq, pmixp_coll_state2str(coll->state));
#endif
		if ((coll->seq + 1) != seq) {
			char *nodename = pmixp_info_job_host(peerid);
			/* should not happen in normal workflow */
			PMIXP_ERROR("%p: unexpected contrib from %s:%d(x:%d) "
				    "seq = %d, coll->seq = %d, "
				    "state=%s",
				    coll, nodename, peerid, chld_id,
				    seq, coll->seq,
				    pmixp_coll_state2str(coll->state));
			xfree(nodename);
			xassert((coll->seq + 1) == seq);
			abort();
		}
		break;
	default:
		/* should not happen in normal workflow */
		PMIXP_ERROR("%p: unknown collective state %s",
			    coll, pmixp_coll_state2str(coll->state));
		abort();
	}

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	if (coll->contrib_chld[chld_id]) {
		char *nodename = pmixp_info_job_host(peerid);
		/* May be 0 or 1. If greater - transmission skew, ignore.
		 * NOTE: this output is not on the critical path -
		 * don't preprocess it out */
		PMIXP_DEBUG("%p: multiple contribs from %s:%d(x:%d)",
			    coll, nodename, peerid, chld_id);
		/* this is duplication, skip. */
		xfree(nodename);
		goto proceed;
	}

	data_src = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	data_dst = get_buf_data(coll->ufwd_buf) +
			get_buf_offset(coll->ufwd_buf);
	memcpy(data_dst, data_src, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* increase number of individual contributions */
	coll->contrib_chld[chld_id] = true;
	/* increase number of total contributions */
	coll->contrib_children++;

proceed:
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish: node=%s:%d(%d:%d), state=%s",
		    coll, nodename, peerid, lpeerid, chld_id,
		    pmixp_coll_state2str(coll->state));
	xfree(nodename);
#endif
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	return SLURM_SUCCESS;
}
Example 13
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf = NULL;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
			pmixp_info_namespace(), pmixp_info_nodeid(),
			coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads progress_fan_in can be called
		 * after we moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
					coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
			pmixp_info_nodeid(), hostlist);

	/* Check for the singleton case */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where parent
			 * is not ready to receive the messages.
			 * Use zero-size message to check parent status first
			 * and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				get_buf_data(coll->buf),
				get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR(
					"Cannot send data (size = %lu), to hostlist:\n%s",
					(uint64_t) get_buf_offset(coll->buf),
					hostlist);
			/* return an error indication to PMIx. Nodes that
			 * haven't received data will exit by timeout.
			 * FIXME: do we need to do something with nodes
			 * that finished successfully?
			 */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* if we are the root - push data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename was
	 * in the hostlist. However this may lead to undesired side effects:
	 * we are blocked here sending data and cannot receive (receiving is
	 * triggered in this thread only after we leave this callback), so we
	 * would have to rely on buffering on the SLURM side.
	 * Better not to do so. */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset collective here */
		/* copy payload excluding reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
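
The seq == 0 branch above probes the parent with a zero-size message, since the parent may not be listening yet when the very first collective starts. A sketch of that first-contact idea as a retry loop; peer_ping is a hypothetical transport hook, not SLURM's pmixp_server_health_chk:

#include <stdbool.h>
#include <unistd.h>

/* hypothetical transport hook: returns true if the peer answered */
extern bool peer_ping(const char *host);

/* before the very first message of a collective (seq == 0) make sure
 * the parent is up, retrying with a small delay between probes */
static bool wait_for_parent(const char *host, int retries)
{
	while (retries-- > 0) {
		if (peer_ping(host))
			return true;
		usleep(100 * 1000);	/* 100 ms between probes */
	}
	return false;
}
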
Example 14
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
			pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	/* fix the collective status if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if (PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN"
			    " (next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselves if we are running with no asserts */
		goto proceed;
	}

	if (0 < coll->ch_contribs[nodeid]) {
		/* May be 0 or 1. If greater - transmission skew, ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
			    nodeid, nodename);
		/* this is duplication, skip. */
		goto proceed;
	}

	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase number of individual contributions */
	coll->ch_contribs[nodeid]++;

	/* increase number of total contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	if (PMIXP_COLL_FAN_IN == coll->state) {
		/* make a progress if we are in fan-in state */
		_progress_fan_in(coll);
	}

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	default:
		state = "unknown";
		break;
	}

	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
		    state);

	return SLURM_SUCCESS;
}
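
The trailing switch stringifies the state by hand; later SLURM versions centralize this in pmixp_coll_state2str(). A sketch of such a helper with a safe default (the enum names here are illustrative):

typedef enum {
	COLL_SYNC,
	COLL_FAN_IN,
	COLL_FAN_OUT,
	COLL_FAN_OUT_IN
} coll_state_t;

/* stringify a state for logging; never returns NULL, so it is
 * always safe to pass the result to a %s format directive */
static const char *coll_state2str(coll_state_t state)
{
	switch (state) {
	case COLL_SYNC:		return "sync";
	case COLL_FAN_IN:	return "fan-in";
	case COLL_FAN_OUT:	return "fan-out";
	case COLL_FAN_OUT_IN:	return "fan-out-in";
	default:		return "unknown";
	}
}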