Exemple #1
0
/* This set of locks it designed to prevent race conditions when changing
 * CPU frequency or govorner. Specifically, when a job ends it should only
 * reset CPU frequency if it was the last job to set the CPU frequency.
 * with gang scheduling and cancellation of suspended or running jobs there
 * can be timing issues.
 * _set_cpu_owner_lock  - set specified job to own the CPU, this CPU file is
 *	locked on exit
 * _test_cpu_owner_lock - test if the specified job owns the CPU, this CPU is
 *	locked on return with true
 */
static int _set_cpu_owner_lock(int cpu_id, uint32_t job_id)
{
	char tmp[64];
	int fd, sz;

	if (!slurmd_spooldir)
		slurmd_spooldir = slurm_get_slurmd_spooldir();

	snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir);
	if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) {
		error("mkdir failed: %m %s",tmp);
		return -1;
	}
	snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id);
	fd = open(tmp, O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		error("%s: open: %m %s", __func__, tmp);
		return fd;
	}
	if (_fd_lock_retry(fd) < 0)
		error("%s: fd_get_write_lock: %m %s", __func__, tmp);
	sz = sizeof(uint32_t);
	if (fd_write_n(fd, (void *) &job_id, sz) != sz)
		error("%s: write: %m %s", __func__, tmp);

	return fd;
}
Exemple #2
0
static int _test_cpu_owner_lock(int cpu_id, uint32_t job_id)
{
	char tmp[64];
	uint32_t in_job_id;
	int fd, sz;

	if (!slurmd_spooldir)
		slurmd_spooldir = slurm_get_slurmd_spooldir();

	snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir);
	if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) {
		error("%s: mkdir failed: %m %s", __func__, tmp);
		return -1;
	}
	snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id);
	fd = open(tmp, O_RDWR, 0600);
	if (fd < 0) {
		if (errno != ENOENT)	/* Race condition */
			error("%s: open: %m %s", __func__, tmp);
		return -1;
	}
	if (_fd_lock_retry(fd) < 0) {
		error("%s: fd_get_write_lock: %m %s", __func__, tmp);
		close(fd);
		return -1;
	}
	sz = sizeof(uint32_t);
	if (fd_read_n(fd, (void *) &in_job_id, sz) != sz) {
		error("%s: read: %m %s", __func__, tmp);
		close(fd);
		return -1;
	}
	if (job_id != in_job_id) {
		/* Result of various race conditions */
		debug("%s: CPU %d now owned by job %u rather than job %u",
		      __func__, cpu_id, in_job_id, job_id);
		close(fd);
		return -1;
	}
	debug("%s: CPU %d owned by job %u as expected",
	      __func__, cpu_id, job_id);

	return fd;
}
Exemple #3
0
static int _env_set(char ***env)
{
	char *p = NULL;

	xassert(_pmixp_job_info.hostname);

	_pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL);

	_pmixp_job_info.lib_tmpdir = slurm_conf_expand_slurmd_path(
		_pmixp_job_info.server_addr_unfmt, _pmixp_job_info.hostname);

	xstrfmtcat(_pmixp_job_info.server_addr_unfmt, "/stepd.slurm.pmix.%d.%d",
		   pmixp_info_jobid(), pmixp_info_stepid());

	_pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir);

	/* ----------- Temp directories settings ------------- */
	xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/",
		   pmixp_info_jobid(), pmixp_info_stepid());

	/* save client temp directory if requested
	 * TODO: We want to get TmpFS value as well if exists.
	 * Need to sync with SLURM developers.
	 */
	p = getenvp(*env, PMIXP_TMPDIR_CLI);

	if (p)
		_pmixp_job_info.cli_tmpdir_base = xstrdup(p);
	else
		_pmixp_job_info.cli_tmpdir_base = slurm_get_tmp_fs(
			_pmixp_job_info.hostname);

	_pmixp_job_info.cli_tmpdir =
		xstrdup_printf("%s/spmix_appdir_%d.%d",
			       _pmixp_job_info.cli_tmpdir_base,
			       pmixp_info_jobid(), pmixp_info_stepid());


	/* ----------- Timeout setting ------------- */
	/* TODO: also would be nice to have a cluster-wide setting in SLURM */
	_pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
	p = getenvp(*env, PMIXP_TIMEOUT);
	if (NULL != p) {
		int tmp;
		tmp = atoi(p);
		if (tmp > 0) {
			_pmixp_job_info.timeout = tmp;
		}
	}

	/* ----------- Forward PMIX settings ------------- */
	/* FIXME: this may be intrusive as well as PMIx library will create
	 * lots of output files in /tmp by default.
	 * somebody can use this or annoyance */
	p = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
	if (NULL != p) {
		setenv(PMIXP_PMIXLIB_DEBUG, p, 1);
		/* output into the file since we are in slurmstepd
		 * and stdout is muted.
		 * One needs to check TMPDIR for the results */
		setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
	}

	return SLURM_SUCCESS;
}