/* This set of locks it designed to prevent race conditions when changing * CPU frequency or govorner. Specifically, when a job ends it should only * reset CPU frequency if it was the last job to set the CPU frequency. * with gang scheduling and cancellation of suspended or running jobs there * can be timing issues. * _set_cpu_owner_lock - set specified job to own the CPU, this CPU file is * locked on exit * _test_cpu_owner_lock - test if the specified job owns the CPU, this CPU is * locked on return with true */ static int _set_cpu_owner_lock(int cpu_id, uint32_t job_id) { char tmp[64]; int fd, sz; if (!slurmd_spooldir) slurmd_spooldir = slurm_get_slurmd_spooldir(); snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir); if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) { error("mkdir failed: %m %s",tmp); return -1; } snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id); fd = open(tmp, O_CREAT | O_RDWR, 0600); if (fd < 0) { error("%s: open: %m %s", __func__, tmp); return fd; } if (_fd_lock_retry(fd) < 0) error("%s: fd_get_write_lock: %m %s", __func__, tmp); sz = sizeof(uint32_t); if (fd_write_n(fd, (void *) &job_id, sz) != sz) error("%s: write: %m %s", __func__, tmp); return fd; }
static int _test_cpu_owner_lock(int cpu_id, uint32_t job_id) { char tmp[64]; uint32_t in_job_id; int fd, sz; if (!slurmd_spooldir) slurmd_spooldir = slurm_get_slurmd_spooldir(); snprintf(tmp, sizeof(tmp), "%s/cpu", slurmd_spooldir); if ((mkdir(tmp, 0700) != 0) && (errno != EEXIST)) { error("%s: mkdir failed: %m %s", __func__, tmp); return -1; } snprintf(tmp, sizeof(tmp), "%s/cpu/%d", slurmd_spooldir, cpu_id); fd = open(tmp, O_RDWR, 0600); if (fd < 0) { if (errno != ENOENT) /* Race condition */ error("%s: open: %m %s", __func__, tmp); return -1; } if (_fd_lock_retry(fd) < 0) { error("%s: fd_get_write_lock: %m %s", __func__, tmp); close(fd); return -1; } sz = sizeof(uint32_t); if (fd_read_n(fd, (void *) &in_job_id, sz) != sz) { error("%s: read: %m %s", __func__, tmp); close(fd); return -1; } if (job_id != in_job_id) { /* Result of various race conditions */ debug("%s: CPU %d now owned by job %u rather than job %u", __func__, cpu_id, in_job_id, job_id); close(fd); return -1; } debug("%s: CPU %d owned by job %u as expected", __func__, cpu_id, job_id); return fd; }
static int _env_set(char ***env) { char *p = NULL; xassert(_pmixp_job_info.hostname); _pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL); _pmixp_job_info.lib_tmpdir = slurm_conf_expand_slurmd_path( _pmixp_job_info.server_addr_unfmt, _pmixp_job_info.hostname); xstrfmtcat(_pmixp_job_info.server_addr_unfmt, "/stepd.slurm.pmix.%d.%d", pmixp_info_jobid(), pmixp_info_stepid()); _pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir); /* ----------- Temp directories settings ------------- */ xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/", pmixp_info_jobid(), pmixp_info_stepid()); /* save client temp directory if requested * TODO: We want to get TmpFS value as well if exists. * Need to sync with SLURM developers. */ p = getenvp(*env, PMIXP_TMPDIR_CLI); if (p) _pmixp_job_info.cli_tmpdir_base = xstrdup(p); else _pmixp_job_info.cli_tmpdir_base = slurm_get_tmp_fs( _pmixp_job_info.hostname); _pmixp_job_info.cli_tmpdir = xstrdup_printf("%s/spmix_appdir_%d.%d", _pmixp_job_info.cli_tmpdir_base, pmixp_info_jobid(), pmixp_info_stepid()); /* ----------- Timeout setting ------------- */ /* TODO: also would be nice to have a cluster-wide setting in SLURM */ _pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT; p = getenvp(*env, PMIXP_TIMEOUT); if (NULL != p) { int tmp; tmp = atoi(p); if (tmp > 0) { _pmixp_job_info.timeout = tmp; } } /* ----------- Forward PMIX settings ------------- */ /* FIXME: this may be intrusive as well as PMIx library will create * lots of output files in /tmp by default. * somebody can use this or annoyance */ p = getenvp(*env, PMIXP_PMIXLIB_DEBUG); if (NULL != p) { setenv(PMIXP_PMIXLIB_DEBUG, p, 1); /* output into the file since we are in slurmstepd * and stdout is muted. * One needs to check TMPDIR for the results */ setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1); } return SLURM_SUCCESS; }