Exemple #1
0
static int _env_set(char ***env)
{
	char *p = NULL;

	xassert(_pmixp_job_info.hostname);

	_pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL);

	_pmixp_job_info.lib_tmpdir = slurm_conf_expand_slurmd_path(
		_pmixp_job_info.server_addr_unfmt, _pmixp_job_info.hostname);

	xstrfmtcat(_pmixp_job_info.server_addr_unfmt, "/stepd.slurm.pmix.%d.%d",
		   pmixp_info_jobid(), pmixp_info_stepid());

	_pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir);

	/* ----------- Temp directories settings ------------- */
	xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/",
		   pmixp_info_jobid(), pmixp_info_stepid());

	/* save client temp directory if requested
	 * TODO: We want to get TmpFS value as well if exists.
	 * Need to sync with SLURM developers.
	 */
	p = getenvp(*env, PMIXP_TMPDIR_CLI);

	if (p)
		_pmixp_job_info.cli_tmpdir_base = xstrdup(p);
	else
		_pmixp_job_info.cli_tmpdir_base = slurm_get_tmp_fs(
			_pmixp_job_info.hostname);

	_pmixp_job_info.cli_tmpdir =
		xstrdup_printf("%s/spmix_appdir_%d.%d",
			       _pmixp_job_info.cli_tmpdir_base,
			       pmixp_info_jobid(), pmixp_info_stepid());


	/* ----------- Timeout setting ------------- */
	/* TODO: also would be nice to have a cluster-wide setting in SLURM */
	_pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
	p = getenvp(*env, PMIXP_TIMEOUT);
	if (NULL != p) {
		int tmp;
		tmp = atoi(p);
		if (tmp > 0) {
			_pmixp_job_info.timeout = tmp;
		}
	}

	/* ----------- Forward PMIX settings ------------- */
	/* FIXME: this may be intrusive as well as PMIx library will create
	 * lots of output files in /tmp by default.
	 * somebody can use this or annoyance */
	p = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
	if (NULL != p) {
		setenv(PMIXP_PMIXLIB_DEBUG, p, 1);
		/* output into the file since we are in slurmstepd
		 * and stdout is muted.
		 * One needs to check TMPDIR for the results */
		setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
	}

	return SLURM_SUCCESS;
}
Exemple #2
0
/*
 * Connect to a slurmstepd proccess by way of its unix domain socket.
 *
 * Both "directory" and "nodename" may be null, in which case stepd_connect
 * will attempt to determine them on its own.  If you are using multiple
 * slurmd on one node (unusual outside of development environments), you
 * will get one of the local NodeNames more-or-less at random.
 *
 * Returns a socket descriptor for the opened socket on success,
 * and -1 on error.
 */
int
stepd_connect(const char *directory, const char *nodename,
	      uint32_t jobid, uint32_t stepid)
{
	int req = REQUEST_CONNECT;
	int fd = -1;
	int rc;
	void *auth_cred;
	Buf buffer;
	int len;

	if (nodename == NULL) {
		if (!(nodename = _guess_nodename()))
			return -1;
	}
	if (directory == NULL) {
		slurm_ctl_conf_t *cf;

		cf = slurm_conf_lock();
		directory = slurm_conf_expand_slurmd_path(
			cf->slurmd_spooldir, nodename);
		slurm_conf_unlock();
	}

	buffer = init_buf(0);
	/* Create an auth credential */
	auth_cred = g_slurm_auth_create(NULL, 2, NULL);
	if (auth_cred == NULL) {
		error("Creating authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Pack the auth credential */
	rc = g_slurm_auth_pack(auth_cred, buffer);
	(void) g_slurm_auth_destroy(auth_cred);
	if (rc) {
		error("Packing authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Connect to the step */
	fd = _step_connect(directory, nodename, jobid, stepid);
	if (fd == -1)
		goto fail1;

	safe_write(fd, &req, sizeof(int));
	len = size_buf(buffer);
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);

	safe_read(fd, &rc, sizeof(int));
	if (rc < 0) {
		error("slurmstepd refused authentication: %m");
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto rwfail;
	}

	free_buf(buffer);
	return fd;

rwfail:
	close(fd);
fail1:
	free_buf(buffer);
	return -1;
}
Exemple #3
0
/*
 * Scan for available running slurm step daemons by checking
 * "directory" for unix domain sockets with names beginning in "nodename".
 *
 * Both "directory" and "nodename" may be null, in which case stepd_available
 * will attempt to determine them on its own.  If you are using multiple
 * slurmd on one node (unusual outside of development environments), you
 * will get one of the local NodeNames more-or-less at random.
 *
 * Returns a List of pointers to step_loc_t structures.
 */
List
stepd_available(const char *directory, const char *nodename)
{
	List l;
	DIR *dp;
	struct dirent *ent;
	regex_t re;
	struct stat stat_buf;

	if (nodename == NULL) {
		if (!(nodename = _guess_nodename()))
			return NULL;
	}
	if (directory == NULL) {
		slurm_ctl_conf_t *cf;

		cf = slurm_conf_lock();
		directory = slurm_conf_expand_slurmd_path(
			cf->slurmd_spooldir, nodename);
		slurm_conf_unlock();
	}

	l = list_create((ListDelF) _free_step_loc_t);
	if (_sockname_regex_init(&re, nodename) == -1)
		goto done;

	/*
	 * Make sure that "directory" exists and is a directory.
	 */
	if (stat(directory, &stat_buf) < 0) {
		error("Domain socket directory %s: %m", directory);
		goto done;
	} else if (!S_ISDIR(stat_buf.st_mode)) {
		error("%s is not a directory", directory);
		goto done;
	}

	if ((dp = opendir(directory)) == NULL) {
		error("Unable to open directory: %m");
		goto done;
	}

	while ((ent = readdir(dp)) != NULL) {
		step_loc_t *loc;
		uint32_t jobid, stepid;

		if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) {
			debug4("found jobid = %u, stepid = %u", jobid, stepid);
			loc = xmalloc(sizeof(step_loc_t));
			loc->directory = xstrdup(directory);
			loc->nodename = xstrdup(nodename);
			loc->jobid = jobid;
			loc->stepid = stepid;
			list_append(l, (void *)loc);
		}
	}

	closedir(dp);
done:
	regfree(&re);
	return l;
}
Exemple #4
0
/*
 * Connect to a slurmstepd proccess by way of its unix domain socket.
 *
 * Both "directory" and "nodename" may be null, in which case stepd_connect
 * will attempt to determine them on its own.  If you are using multiple
 * slurmd on one node (unusual outside of development environments), you
 * will get one of the local NodeNames more-or-less at random.
 *
 * Returns a socket descriptor for the opened socket on success,
 * and -1 on error.
 */
int
stepd_connect(const char *directory, const char *nodename,
	      uint32_t jobid, uint32_t stepid, uint16_t *protocol_version)
{
	int req = REQUEST_CONNECT;
	int fd = -1;
	int rc;
	void *auth_cred;
	Buf buffer;
	int len;

	*protocol_version = 0;

	if (nodename == NULL) {
		if (!(nodename = _guess_nodename()))
			return -1;
	}
	if (directory == NULL) {
		slurm_ctl_conf_t *cf;

		cf = slurm_conf_lock();
		directory = slurm_conf_expand_slurmd_path(
			cf->slurmd_spooldir, nodename);
		slurm_conf_unlock();
	}

	buffer = init_buf(0);
	/* Create an auth credential */
	auth_cred = g_slurm_auth_create(NULL, 2, NULL);
	if (auth_cred == NULL) {
		error("Creating authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Pack the auth credential */
	rc = g_slurm_auth_pack(auth_cred, buffer);
	(void) g_slurm_auth_destroy(auth_cred);
	if (rc) {
		error("Packing authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Connect to the step */
	fd = _step_connect(directory, nodename, jobid, stepid);
	if (fd == -1)
		goto fail1;

	safe_write(fd, &req, sizeof(int));
	len = size_buf(buffer);
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);

	safe_read(fd, &rc, sizeof(int));
	if (rc < 0) {
		error("slurmstepd refused authentication: %m");
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto rwfail;
	} else if (rc)
		*protocol_version = rc;
	else {
		/* 0n older versions of Slurm < 14.11 SLURM_SUCCESS
		 * was returned here instead of the protocol version.
		 * This can be removed when we are 2 versions past
		 * 14.11.
		 */
		slurmstepd_info_t *stepd_info = stepd_get_info(fd);
		*protocol_version = stepd_info->protocol_version;
		xfree(stepd_info);
	}

	free_buf(buffer);
	return fd;

rwfail:
	close(fd);
fail1:
	free_buf(buffer);
	return -1;
}