Exemple #1
0
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution", pmixp_info_namespace(),
			pmixp_info_nodeid());

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* change the collective state if need */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG(
				"%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
				pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* save & mark local contribution */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* check if the collective is ready to progress */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
			pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
Exemple #2
0
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	xassert(PMIXP_COLL_FAN_OUT == coll->state || PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* update the database */
	if (NULL != coll->cbfunc) {
		void *data = get_buf_data(buf) + get_buf_offset(buf);
		size_t size = remaining_buf(buf);
		PMIXP_DEBUG("%s:%d: use the callback", pmixp_info_namespace(),
				pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
				pmixp_free_Buf, (void *)buf);
	}
	/* Prepare for the next collective operation */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
			pmixp_info_namespace(), pmixp_info_nodeid());
}
Exemple #3
0
void pmixp_coll_bcast(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	_progres_fan_out(coll, buf);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* We may already start next collective. Try to progress!
	 * its OK if we in SYNC - there will be no-op */
	_progress_fan_in(coll);
}
Exemple #4
0
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is unexepable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain incapsulation - in general `buf`is
		 * not a pointer, but opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the pingpong mode was activated -
		 * node 0 sends ping requests
		 * and receiver assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
Exemple #5
0
/*
 * For this to work the following conditions supposed to be
 * satisfied:
 * - SLURM has to be configured with `--enable-debug` option
 * - jobstep needs to have at least two nodes
 * In this case communication exchange will be done between
 * the first two nodes.
 */
void pmixp_server_run_pp(void)
{
	int i;
	size_t start, end, bound;
	/* ping is initiated by the nodeid == 0
	 * all the rest - just exit
	 */
	if (pmixp_info_nodeid()) {
		return;
	}


	start = 1 << _pmixp_pp_low;
	end = 1 << _pmixp_pp_up;
	bound = 1 << _pmixp_pp_bound;

	for (i = start; i <= end; i *= 2) {
		int count, iters = _pmixp_pp_siter;
		struct timeval tv1, tv2;
		double time;
		if (i >= bound) {
			iters = _pmixp_pp_liter;
		}

		if (!_pmixp_pp_same_thr) {
			/* warmup - 10% of iters # */
			count = pmixp_server_pp_count() + iters/10;
			while (pmixp_server_pp_count() < count) {
				int cur_count = pmixp_server_pp_count();
				pmixp_server_pp_send(1, i);
				while (cur_count == pmixp_server_pp_count()) {
					usleep(1);
				}
			}

			count = pmixp_server_pp_count() + iters;
			gettimeofday(&tv1, NULL);
			while (pmixp_server_pp_count() < count) {
				int cur_count = pmixp_server_pp_count();
				/* Send the message to the (nodeid == 1) */
				pmixp_server_pp_send(1, i);
				/* wait for the response */
				while (cur_count == pmixp_server_pp_count());
			}
			gettimeofday(&tv2, NULL);
			time = tv2.tv_sec + 1E-6 * tv2.tv_usec -
					(tv1.tv_sec + 1E-6 * tv1.tv_usec);
			/* Output measurements to the slurmd.log */
			PMIXP_ERROR("latency: %d - %.9lf", i, time / iters );
		} else {
			int count = iters + iters/10;

			slurm_mutex_lock(&_pmixp_pp_lock);
			_pmixp_pp_warmup = iters/10;
			_pmixp_pp_iters = iters;
			_pmixp_pp_count = 0;
			slurm_mutex_unlock(&_pmixp_pp_lock);
			/* initiate sends */
			pmixp_server_pp_send(1, i);
			while (pmixp_server_pp_count() < count){
				sched_yield();
			}
		}
	}
}
Exemple #6
0
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
			pmixp_info_namespace(), pmixp_info_nodeid(),
			coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads progress_fan_in can be called
		 * after we moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
					coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
			pmixp_info_nodeid(), hostlist);

	/* Check for the singletone case */
	if (NULL != hostlist) {
		if( 0 == coll->seq && NULL != coll->parent_host ){
			/* This is the first message sent to the parent.
			 * There might be a race condition where parent
			 * is not ready to receive the messages.
			 * Use zero-size message to check parent status first
			 * and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				get_buf_data(coll->buf),
				get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR(
					"Cannot send data (size = %lu), to hostlist:\n%s",
					(uint64_t) get_buf_offset(coll->buf),
					hostlist);
			/* return error indication to PMIx. Nodes that haven't received data
			 * will exit by a timeout.
			 * FIXME: do we need to do something with successfuly finished nodes?
			 */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* if we are root - push data to PMIx here.
	 * Originally there was a homogenuous solution: root nodename was in the hostlist.
	 * However this may lead to the undesired side effects: we are blocked here sending
	 * data and cannot receive (it will be triggered in this thread after we will leave
	 * this callback), so we have to rely on buffering on the SLURM side.
	 * Better not to do so. */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset collective here */
		/* copy payload excluding reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* lock the */
	slurm_mutex_unlock(&coll->lock);
}
Exemple #7
0
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
			pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	/* fix the collective status if need */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if( PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN"
			    " (next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state || PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution second time. Avoid duplications
	 * by checking our records. */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselfs if we are running with no asserts */
		goto proceed;
	}

	if (0 < coll->ch_contribs[nodeid]) {
		/* May be 0 or 1. If grater - transmission skew, ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
			    nodeid, nodename);
		/* this is duplication, skip. */
		goto proceed;
	}

	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase number of individual contributions */
	coll->ch_contribs[nodeid]++;

	/* increase number of total contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	if( PMIXP_COLL_FAN_IN == coll->state ){
		/* make a progress if we are in fan-in state */
		_progress_fan_in(coll);
	}

	switch( coll->state ){
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	}

	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
		    state);

	return SLURM_SUCCESS;
}