Example #1
int pmixp_stepd_finalize(void)
{
	char *path;
	if (!_was_initialized) {
		/* nothing to do */
		return SLURM_SUCCESS;
	}

	pmixp_libpmix_finalize();
	pmixp_dmdx_finalize();

	pmixp_conn_fini();
	pmixp_dconn_fini();

	pmixp_state_finalize();
	pmixp_nspaces_finalize();

	/* cleanup the UNIX socket */
	PMIXP_DEBUG("Remove PMIx plugin usock");
	close(pmixp_info_srv_usock_fd());
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	unlink(path);
	xfree(path);

	/* free the information */
	pmixp_info_free();
	return SLURM_SUCCESS;
}
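pmixp_stepd_finalize() is safe to call even when initialization never happened, thanks to the _was_initialized guard set at the end of Example #9/#10. A minimal sketch of a plugin teardown hook relying on that guard (the hook name fini() follows the usual Slurm plugin convention; treat the pairing as an assumption here):

/* Sketch: plugin teardown hook. Safe to call unconditionally
 * because pmixp_stepd_finalize() checks _was_initialized first. */
extern int fini(void)
{
	return pmixp_stepd_finalize();
}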
Example #2
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size)
{
	PMIXP_DEBUG("%s:%d: get local contribution", pmixp_info_namespace(),
			pmixp_info_nodeid());

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	/* change the collective state if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG(
				"%s:%d: get local contribution: switch to PMIXP_COLL_FAN_IN",
				pmixp_info_namespace(), pmixp_info_nodeid());
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state);

	/* save & mark local contribution */
	coll->contrib_local = true;
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* check if the collective is ready to progress */
	_progress_fan_in(coll);

	PMIXP_DEBUG("%s:%d: get local contribution: finish",
			pmixp_info_namespace(), pmixp_info_nodeid());

	return SLURM_SUCCESS;
}
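The local contribution is appended with a grow/copy/advance sequence on Slurm's Buf type; the same idiom recurs in Example #13. A hedged helper capturing just that step (the helper name is mine, not the plugin's):

/* Sketch: append `size` bytes at the Buf's current offset,
 * mirroring the grow_buf/memcpy/set_buf_offset sequence above. */
static void _buf_append(Buf buf, const char *data, size_t size)
{
	grow_buf(buf, size);	/* ensure capacity for `size` more bytes */
	memcpy(get_buf_data(buf) + get_buf_offset(buf), data, size);
	set_buf_offset(buf, get_buf_offset(buf) + size);
}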
Example #3
void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	pmixp_coll_sanity_check(coll);

	xassert(PMIXP_COLL_FAN_OUT == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* update the database */
	if (NULL != coll->cbfunc) {
		void *data = get_buf_data(buf) + get_buf_offset(buf);
		size_t size = remaining_buf(buf);
		PMIXP_DEBUG("%s:%d: use the callback", pmixp_info_namespace(),
				pmixp_info_nodeid());
		coll->cbfunc(PMIX_SUCCESS, data, size, coll->cbdata,
				pmixp_free_Buf, (void *)buf);
	}
	/* Prepare for the next collective operation */
	_fan_out_finished(coll);

	PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
			pmixp_info_namespace(), pmixp_info_nodeid());
}
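The buffer is handed to the callback together with pmixp_free_Buf as a release function, so the consumer frees it once the data has been used. pmixp_free_Buf itself is defined elsewhere in the plugin; a sketch of the shape such a release callback takes (an assumption about its body):

/* Sketch: release callback invoked by the consumer of `data`
 * once it no longer needs the underlying buffer. */
static void _free_buf_cb(void *cbdata)
{
	Buf buf = (Buf)cbdata;
	free_buf(buf);
}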
Example #4
void pmixp_coll_bcast(pmixp_coll_t *coll, Buf buf)
{
	PMIXP_DEBUG("%s:%d: start", pmixp_info_namespace(), pmixp_info_nodeid());

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	_progres_fan_out(coll, buf);

	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	/* We may have already started the next collective. Try to
	 * progress: it is OK if we are in SYNC, that will be a no-op */
	_progress_fan_in(coll);
}
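Note that _progress_fan_in() is deliberately invoked after coll->lock is released: as Example #12 shows, it acquires the same mutex itself, so calling it while still holding the lock would deadlock.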
Example #5
static int _pmixp_server_cperf_iter(char *data, int ndata)
{
	pmixp_coll_t *coll;
	pmixp_proc_t procs;
	int cur_count = _pmixp_server_cperf_count();

	strncpy(procs.nspace, pmixp_info_namespace(), PMIXP_MAX_NSLEN);
	procs.rank = pmixp_lib_get_wildcard();

	coll = pmixp_state_coll_get(PMIXP_COLL_TYPE_FENCE, &procs, 1);
	xassert(!pmixp_coll_contrib_local(coll, data, ndata,
					  _pmixp_cperf_cbfunc, NULL));

	while (cur_count == _pmixp_server_cperf_count()) {
		usleep(1);
	}
	return 0;
}
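Two details worth noting: this call site passes a completion callback, matching a later revision of pmixp_coll_contrib_local() than the one shown in Example #2, and the call is wrapped in xassert(), which compiles away in NDEBUG builds, so production code would want the call outside the assert. The loop then busy-waits (with a 1 us sleep) until the collective counter advances. A hedged driver for repeated iterations (function and parameter names are mine):

/* Sketch: run `niter` collective performance iterations over a
 * scratch payload of `msize` bytes. */
static void _cperf_run(int niter, int msize)
{
	char *payload = xmalloc(msize);
	int i;

	for (i = 0; i < niter; i++)
		_pmixp_server_cperf_iter(payload, msize);
	xfree(payload);
}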
Example #6
static void _respond_with_error(int seq_num, char *sender_host,
				char *sender_ns, int status)
{
	Buf buf = create_buf(NULL, 0);
	char *addr;
	int rc;

	/* rank doesn't matter here, don't send it */
	_setup_header(buf, DMDX_RESPONSE, pmixp_info_namespace(), -1, status);
	/* generate namespace usocket name */
	addr = pmixp_info_nspace_usock(sender_ns);
	/* send response */
	rc = pmixp_server_send(sender_host, PMIXP_MSG_DMDX, seq_num, addr,
			get_buf_data(buf), get_buf_offset(buf), 1);
	if (SLURM_SUCCESS != rc) {
		PMIXP_ERROR("Cannot send direct modex error" " response to %s",
				sender_host);
	}
	xfree(addr);
	free_buf(buf);
}
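The same header helper presumably serves the request side as well; a hedged sketch of what issuing a direct-modex request could look like (DMDX_REQUEST is assumed as the counterpart of DMDX_RESPONSE, and _send_request is my name, not the plugin's):

/* Sketch: issue a direct modex request for (nspace, rank) to `host`.
 * Mirrors the response path above: pack the header, resolve the
 * remote rendezvous socket, send, release the temporaries. */
static int _send_request(char *host, const char *nspace, int rank,
			 uint32_t seq_num)
{
	Buf buf = create_buf(NULL, 0);
	char *addr = pmixp_info_nspace_usock(nspace);
	int rc;

	_setup_header(buf, DMDX_REQUEST, nspace, rank, SLURM_SUCCESS);
	rc = pmixp_server_send(host, PMIXP_MSG_DMDX, seq_num, addr,
			       get_buf_data(buf), get_buf_offset(buf), 1);
	xfree(addr);
	free_buf(buf);
	return rc;
}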
Example #7
static void _setup_header(Buf buf, dmdx_type_t t,
			  const char *nspace, int rank, int status)
{
	char *str;
	/* 1. pack message type */
	unsigned char type = (char)t;
	grow_buf(buf, sizeof(char));
	pack8(type, buf);

	/* 2. pack namespace _with_ '\0' (strlen(nspace) + 1)! */
	packmem((char *)nspace, strlen(nspace) + 1, buf);

	/* 3. pack rank */
	grow_buf(buf, sizeof(int));
	pack32((uint32_t)rank, buf);

	/* 4. pack my rendezvous point - local namespace
	 * ! _with_ '\0' (strlen(nspace) + 1) ! */
	str = pmixp_info_namespace();
	packmem(str, strlen(str) + 1, buf);

	/* 5. pack the status */
	pack32((uint32_t)status, buf);
}
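Example #8 consumes this header through _read_info(), which is not shown in this section. A reader consistent with the pack order above would look roughly like this (a sketch, not the plugin's actual _read_info; unpackmem_ptr() returns pointers into the buffer, which is why Example #8 treats ns and sender_ns as "pointers inside the incoming buffer"):

/* Sketch: unpack the header fields in the exact order they were
 * packed above. The message type byte is assumed to have been
 * consumed earlier by the dispatch code. */
static int _read_header(Buf buf, char **nspace, int *rank,
			char **sender_ns, int *status)
{
	uint32_t tmp, size;

	/* 2. namespace, packed including its trailing '\0' */
	if (unpackmem_ptr(nspace, &size, buf))
		return SLURM_ERROR;
	/* 3. rank */
	if (unpack32(&tmp, buf))
		return SLURM_ERROR;
	*rank = (int)tmp;
	/* 4. sender's rendezvous namespace, also with '\0' */
	if (unpackmem_ptr(sender_ns, &size, buf))
		return SLURM_ERROR;
	/* 5. status */
	if (unpack32(&tmp, buf))
		return SLURM_ERROR;
	*status = (int)tmp;
	return SLURM_SUCCESS;
}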
Example #8
static void _dmdx_req(Buf buf, char *sender_host, uint32_t seq_num)
{
	int rank, rc;
	int status;
	char *ns = NULL, *sender_ns = NULL;
	pmixp_namespace_t *nsptr;
	dmdx_caddy_t *caddy = NULL;

	rc = _read_info(buf, &ns, &rank, &sender_ns, &status);
	if (SLURM_SUCCESS != rc) {
		/* There is not much we can do here, but data
		 * corruption shouldn't happen at this level */
		PMIXP_ERROR("Failed to unpack header data in request from %s, rc = %d",
			    sender_host, rc);
		goto exit;
	}

	if (0 != xstrcmp(ns, pmixp_info_namespace())) {
		/* A request for a namespace that is not controlled by
		 * this daemon is considered an error. This may change
		 * in the future. */
		PMIXP_ERROR("Bad request from %s: asked for nspace = %s, mine is %s",
			    sender_host, ns, pmixp_info_namespace());
		_respond_with_error(seq_num, sender_host, sender_ns,
				PMIX_ERR_INVALID_NAMESPACE);
		goto exit;
	}

	nsptr = pmixp_nspaces_local();
	if (nsptr->ntasks <= rank) {
		PMIXP_ERROR("Bad request from %s: nspace \"%s\"" " has only %d ranks, asked for %d",
			    sender_host, ns, nsptr->ntasks, rank);
		_respond_with_error(seq_num, sender_host, sender_ns,
				PMIX_ERR_BAD_PARAM);
		goto exit;
	}

	/* setup a temp structure to handle information from _dmdx_pmix_cb */
	caddy = xmalloc(sizeof(dmdx_caddy_t));
	caddy->seq_num = seq_num;

	/* ns is a pointer inside incoming buffer */
	strncpy(caddy->proc.nspace, ns, PMIX_MAX_NSLEN);
	ns = NULL; /* protect the data */
	caddy->proc.rank = rank;

	/* sender_host was passed from outside - copy it */
	caddy->sender_host = xstrdup(sender_host);
	sender_host = NULL; /* protect the data */

	/* sender_ns is a pointer inside incoming buffer */
	caddy->sender_ns = xstrdup(sender_ns);
	sender_ns = NULL;

	rc = PMIx_server_dmodex_request(&caddy->proc, _dmdx_pmix_cb,
			(void *)caddy);
	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Can't request modex data from libpmix-server," "requesting host = %s, nspace = %s, rank = %d, rc = %d",
			    caddy->sender_host, caddy->proc.nspace,
			    caddy->proc.rank, rc);
		_respond_with_error(seq_num, caddy->sender_host,
				caddy->sender_ns, rc);
		_dmdx_free_caddy(caddy);
	}
exit:
	/* we don't need this buffer anymore */
	free_buf(buf);

	/* no sense to return errors, engine can't do anything
	 * anyway. We've notified libpmix, that's enough */
}
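_dmdx_free_caddy() is referenced above but not shown. Given that the caddy owns xstrdup'ed copies of sender_host and sender_ns, its teardown is presumably along these lines (a sketch; the plugin's real function may do more):

/* Sketch: release the caddy and the strings it owns. */
static void _dmdx_free_caddy(dmdx_caddy_t *caddy)
{
	if (!caddy)
		return;
	xfree(caddy->sender_host);
	xfree(caddy->sender_ns);
	xfree(caddy);
}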
Example #9
int pmixp_stepd_init(const stepd_step_rec_t *job, char ***env)
{
	char *path;
	int fd, rc;

	if (SLURM_SUCCESS != (rc = pmixp_info_set(job, env))) {
		PMIXP_ERROR("pmixp_info_set(job, env) failed");
		goto err_info;
	}

	/* Create UNIX socket for slurmd communication */
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	if (NULL == path) {
		PMIXP_ERROR("pmixp_info_nspace_usock: out-of-memory");
		rc = SLURM_ERROR;
		goto err_path;
	}
	if ((fd = pmixp_usock_create_srv(path)) < 0) {
		PMIXP_ERROR("pmixp_usock_create_srv");
		rc = SLURM_ERROR;
		goto err_usock;
	}
	fd_set_close_on_exec(fd);
	pmixp_info_srv_usock_set(path, fd);


	if (!pmixp_info_same_arch()) {
		_direct_proto.hdr_unpack_cb = _direct_hdr_unpack_portable;
		_direct_hdr_pack = _direct_hdr_pack_portable;
	}

	pmixp_conn_init(_slurm_proto, _direct_proto);

	if ((rc = pmixp_dconn_init(pmixp_info_nodes_uni(), _direct_proto))) {
		PMIXP_ERROR("pmixp_dconn_init() failed");
		goto err_dconn;
	}

	if ((rc = pmixp_nspaces_init())) {
		PMIXP_ERROR("pmixp_nspaces_init() failed");
		goto err_nspaces;
	}

	if (SLURM_SUCCESS != (rc = pmixp_state_init())) {
		PMIXP_ERROR("pmixp_state_init() failed");
		goto err_state;
	}

	if (SLURM_SUCCESS != (rc = pmixp_dmdx_init())) {
		PMIXP_ERROR("pmixp_dmdx_init() failed");
		goto err_dmdx;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_init())) {
		PMIXP_ERROR("pmixp_libpmix_init() failed");
		goto err_lib;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_job_set())) {
		PMIXP_ERROR("pmixp_libpmix_job_set() failed");
		goto err_job;
	}

	pmixp_server_init_pp(env);
	pmixp_server_init_cperf(env);

	xfree(path);
	_was_initialized = 1;
	return SLURM_SUCCESS;

err_job:
	pmixp_libpmix_finalize();
err_lib:
	pmixp_dmdx_finalize();
err_dmdx:
	pmixp_state_finalize();
err_state:
	pmixp_nspaces_finalize();
err_nspaces:
	pmixp_dconn_fini();
err_dconn:
	pmixp_conn_fini();
	close(pmixp_info_srv_usock_fd());
err_usock:
	xfree(path);
err_path:
	pmixp_info_free();
err_info:
	return rc;
}
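The error-handling ladder undoes initialization in exactly the reverse order of acquisition: each label releases only what was set up before the failing step, so a single goto suffices per failure point. The same idiom in miniature (step names hypothetical):

/* Sketch of the unwind idiom used above. */
int init_chain(void)
{
	int rc;

	if ((rc = step_a()))
		goto err_a;
	if ((rc = step_b()))	/* step_a() already succeeded */
		goto err_b;
	return SLURM_SUCCESS;

err_b:
	undo_a();		/* undo in reverse order */
err_a:
	return rc;
}

Example #10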
int pmixp_stepd_init(const stepd_step_rec_t *job, char ***env)
{
	char *path;
	int fd, rc;

	if (SLURM_SUCCESS != (rc = pmixp_info_set(job, env))) {
		PMIXP_ERROR("pmixp_info_set(job, env) failed");
		return rc;
	}

	/* Create UNIX socket for slurmd communication */
	path = pmixp_info_nspace_usock(pmixp_info_namespace());
	if (NULL == path) {
		PMIXP_ERROR("Out-of-memory");
		rc = SLURM_ERROR;
		goto err_path;
	}
	if ((fd = pmixp_usock_create_srv(path)) < 0) {
		rc = SLURM_ERROR;
		goto err_usock;
	}
	fd_set_close_on_exec(fd);
	pmixp_info_srv_contacts(path, fd);

	if (SLURM_SUCCESS != (rc = pmixp_nspaces_init())) {
		PMIXP_ERROR("pmixp_nspaces_init() failed");
		goto err_usock;
	}

	if (SLURM_SUCCESS != (rc = pmixp_state_init())) {
		PMIXP_ERROR("pmixp_state_init() failed");
		goto err_state;
	}

	if (SLURM_SUCCESS != (rc = pmixp_dmdx_init())) {
		PMIXP_ERROR("pmixp_dmdx_init() failed");
		goto err_dmdx;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_init())) {
		PMIXP_ERROR("pmixp_libpmix_init() failed");
		goto err_lib;
	}

	if (SLURM_SUCCESS != (rc = pmixp_libpmix_job_set())) {
		PMIXP_ERROR("pmixp_libpmix_job_set() failed");
		goto err_job;
	}

	xfree(path);
	_was_initialized = 1;
	return SLURM_SUCCESS;

err_job:
	pmixp_libpmix_finalize();
err_lib:
	pmixp_dmdx_finalize();
err_dmdx:
	pmixp_state_finalize();
err_state:
	pmixp_nspaces_finalize();
err_usock:
	xfree(path);
err_path:
	pmixp_info_free();
	return rc;
}
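This older variant of pmixp_stepd_init() shares the same unwind structure, but note that its err_usock label does not close fd: if pmixp_nspaces_init() fails after the socket was created, the listening socket leaks. The newer variant in Example #9 closes it on the err_dconn path.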
Example #11
int pmixp_libpmix_job_set(void)
{
	List lresp;
	pmix_info_t *info;
	int ninfo;
	ListIterator it;
	pmix_info_t *kvp;

	int i, rc;
	uid_t uid = pmixp_info_jobuid();
	gid_t gid = pmixp_info_jobgid();
	_register_caddy_t *register_caddy;

	register_caddy = xmalloc(sizeof(_register_caddy_t)*(pmixp_info_tasks_loc()+1));
	pmixp_debug_hang(0);

	/* Use list to safely expand/reduce key-value pairs. */
	lresp = list_create(pmixp_xfree_xmalloced);

	_general_proc_info(lresp);

	_set_tmpdirs(lresp);

	_set_procdatas(lresp);

	_set_sizeinfo(lresp);

	if (SLURM_SUCCESS != _set_mapsinfo(lresp)) {
		list_destroy(lresp);
		PMIXP_ERROR("Can't build nodemap");
		return SLURM_ERROR;
	}

	_set_localinfo(lresp);

	ninfo = list_count(lresp);
	PMIX_INFO_CREATE(info, ninfo);
	it = list_iterator_create(lresp);
	i = 0;
	while (NULL != (kvp = list_next(it))) {
		info[i] = *kvp;
		i++;
	}
	list_iterator_destroy(it);
	list_destroy(lresp);

	register_caddy[0].active = 1;
	rc = PMIx_server_register_nspace(pmixp_info_namespace(),
			pmixp_info_tasks_loc(), info, ninfo, _release_cb,
			&register_caddy[0]);

	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Cannot register namespace %s, nlocalproc=%d, "
			    "ninfo = %d", pmixp_info_namespace(),
			    pmixp_info_tasks_loc(), ninfo);
		return SLURM_ERROR;
	}

	PMIXP_DEBUG("task initialization");
	for (i = 0; i < pmixp_info_tasks_loc(); i++) {
		pmix_proc_t proc;
		register_caddy[i+1].active = 1;
		strncpy(proc.nspace, pmixp_info_namespace(), PMIX_MAX_NSLEN);
		proc.rank = pmixp_info_taskid(i);
		rc = PMIx_server_register_client(&proc, uid, gid, NULL,
				_release_cb, &register_caddy[i + 1]);
		if (PMIX_SUCCESS != rc) {
			PMIXP_ERROR("Cannot register client %d(%d) in namespace %s",
				    pmixp_info_taskid(i), i,
				    pmixp_info_namespace());
			return SLURM_ERROR;
		}
	}

	/* wait for all registration actions to finish */
	while (1) {
		int exit_flag = 1;
		struct timespec ts;
		ts.tv_sec = 0;
		ts.tv_nsec = 100;

		for (i = 0; i < pmixp_info_tasks_loc() + 1; i++) {
			if (register_caddy[i].active) {
				exit_flag = 0;
			}
		}
		if (exit_flag) {
			break;
		}
		nanosleep(&ts, NULL);
	}
	PMIX_INFO_FREE(info, ninfo);
	xfree(register_caddy);

	return SLURM_SUCCESS;
}
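The registration wait is a poll loop: each _release_cb presumably clears its caddy's active flag, and the loop spins (with a 100 ns sleep between scans) until every flag is clear. The same loop factored into a helper (the name is mine):

/* Sketch: block until no registration caddy is still active. */
static void _wait_all_released(_register_caddy_t *caddies, int n)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 };
	int i, busy;

	do {
		busy = 0;
		for (i = 0; i < n; i++) {
			if (caddies[i].active)
				busy = 1;
		}
		if (busy)
			nanosleep(&ts, NULL);
	} while (busy);
}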
Example #12
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
			pmixp_info_namespace(), pmixp_info_nodeid(),
			coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of a race condition between the libpmix and
		 * slurm threads, _progress_fan_in() can be called after
		 * we have moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
					coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
			pmixp_info_nodeid(), hostlist);

	/* Check for the singleton case */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where parent
			 * is not ready to receive the messages.
			 * Use zero-size message to check parent status first
			 * and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				get_buf_data(coll->buf),
				get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu) to hostlist:\n%s",
				    (uint64_t) get_buf_offset(coll->buf),
				    hostlist);
			/* Return an error indication to PMIx. Nodes that
			 * haven't received the data will exit by timeout.
			 * FIXME: do we need to do something with
			 * successfully finished nodes? */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* If we are the root, push data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename
	 * was in the hostlist. However, this can lead to undesired side
	 * effects: we would be blocked here sending data and unable to
	 * receive (receiving is triggered in this thread only after we
	 * leave this callback), so we would have to rely on buffering on
	 * the Slurm side. Better not to do so. */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset collective here */
		/* copy payload excluding reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}
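_copy_payload() is referenced above but not shown; on the root it duplicates everything past the reserved server header (coll->serv_offs) into a fresh Buf that _progres_fan_out() can consume and eventually free. A sketch consistent with that usage (the plugin's real _copy_payload may differ):

/* Sketch: copy the payload of `src` starting at `offs` into a new
 * Buf. The new Buf's offset stays at 0, so remaining_buf() in
 * _progres_fan_out() covers the whole payload. */
static int _copy_payload(Buf src, size_t offs, Buf *dst)
{
	size_t size = get_buf_offset(src) - offs;

	*dst = create_buf(NULL, 0);
	grow_buf(*dst, size);
	memcpy(get_buf_data(*dst), get_buf_data(src) + offs, size);
	return SLURM_SUCCESS;
}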
Example #13
int pmixp_coll_contrib_node(pmixp_coll_t *coll, char *nodename, Buf buf)
{
	int nodeid;
	char *data = NULL;
	uint32_t size;
	char *state = NULL;

	PMIXP_DEBUG("%s:%d: get contribution from node %s",
			pmixp_info_namespace(), pmixp_info_nodeid(), nodename);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	/* fix the collective status if needed */
	if (PMIXP_COLL_SYNC == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_IN",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_IN;
		coll->ts = time(NULL);
	} else if (PMIXP_COLL_FAN_OUT == coll->state) {
		PMIXP_DEBUG("%s:%d: get contribution from node %s: switch to PMIXP_COLL_FAN_OUT_IN"
			    " (next collective!)",
			    pmixp_info_namespace(), pmixp_info_nodeid(),
			    nodename);
		coll->state = PMIXP_COLL_FAN_OUT_IN;
		coll->ts_next = time(NULL);
	}
	xassert(PMIXP_COLL_FAN_IN == coll->state ||
		PMIXP_COLL_FAN_OUT_IN == coll->state);

	/* Because of possible timeouts/delays in transmission we
	 * can receive a contribution a second time. Avoid duplicates
	 * by checking our records. */
	nodeid = hostlist_find(coll->ch_hosts, nodename);
	xassert(0 <= nodeid);
	if (0 > nodeid) {
		/* protect ourselves if we are running with asserts disabled */
		goto proceed;
	}

	if (0 < coll->ch_contribs[nodeid]) {
		/* Can only be 0 or 1. If greater, it is transmission skew; ignore. */
		PMIXP_DEBUG("Multiple contributions from child_id=%d, hostname=%s",
			    nodeid, nodename);
		/* this is duplication, skip. */
		goto proceed;
	}

	data = get_buf_data(buf) + get_buf_offset(buf);
	size = remaining_buf(buf);
	grow_buf(coll->buf, size);
	memcpy(get_buf_data(coll->buf) + get_buf_offset(coll->buf), data, size);
	set_buf_offset(coll->buf, get_buf_offset(coll->buf) + size);

	/* increase number of individual contributions */
	coll->ch_contribs[nodeid]++;

	/* increase number of total contributions */
	coll->contrib_cntr++;

proceed:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);

	if (PMIXP_COLL_FAN_IN == coll->state) {
		/* make a progress if we are in fan-in state */
		_progress_fan_in(coll);
	}

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		state = "sync";
		break;
	case PMIXP_COLL_FAN_IN:
		state = "fan-in";
		break;
	case PMIXP_COLL_FAN_OUT:
		state = "fan-out";
		break;
	case PMIXP_COLL_FAN_OUT_IN:
		state = "fan-out-in";
		break;
	}

	PMIXP_DEBUG("%s:%d: get contribution from node %s: finish. State = %s",
		    pmixp_info_namespace(), pmixp_info_nodeid(), nodename,
		    state);

	return SLURM_SUCCESS;
}
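The trailing switch that maps states to printable names is a natural candidate for a helper; a sketch (the name _coll_state2str is mine, and like the code above it reads coll->state outside the lock):

/* Sketch: printable name for a collective state. */
static const char *_coll_state2str(int state)
{
	switch (state) {
	case PMIXP_COLL_SYNC:
		return "sync";
	case PMIXP_COLL_FAN_IN:
		return "fan-in";
	case PMIXP_COLL_FAN_OUT:
		return "fan-out";
	case PMIXP_COLL_FAN_OUT_IN:
		return "fan-out-in";
	default:
		return "unknown";
	}
}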