/*
 * task_p_slurmd_suspend_job()
 */
extern int task_p_slurmd_suspend_job (uint32_t job_id)
{
	debug("task_p_slurmd_suspend_job: %u", job_id);

#ifdef HAVE_NATIVE_CRAY
	_step_epilogue();
#endif

	return SLURM_SUCCESS;
}
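The body of _step_epilogue() is not included in this listing. Purely as an illustrative placeholder (the cleanup performed below is an assumption, not the plugin's actual implementation), a node-local step epilogue might flush buffered state before the step is torn down:

#include <stdio.h>

/*
 * HYPOTHETICAL sketch of _step_epilogue(); the real function in the
 * task/cray plugin performs Cray-specific cleanup not shown here.
 */
static void _step_epilogue(void)
{
	/* Placeholder cleanup: flush all open output streams. */
	fflush(NULL);
}

int main(void)
{
	_step_epilogue();
	return 0;
}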
Example #2
/*
 * task_p_slurmd_suspend_job()
 */
extern int task_p_slurmd_suspend_job (uint32_t job_id)
{
	DEF_TIMERS;
	START_TIMER;
	debug("task_p_slurmd_suspend_job: %u", job_id);

#ifdef HAVE_NATIVE_CRAY
	_step_epilogue();
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
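DEF_TIMERS, START_TIMER, END_TIMER, and TIME_STR come from SLURM's common macro headers. A minimal self-contained equivalent, assuming gettimeofday()-based interval timing (this is a sketch, not SLURM's exact macro definitions), looks like:

#include <stdio.h>
#include <sys/time.h>

/* Sketch of SLURM-style interval timers (not the upstream macros). */
#define DEF_TIMERS	struct timeval _tv1, _tv2; char _tv_str[32] = ""
#define START_TIMER	gettimeofday(&_tv1, NULL)
#define END_TIMER	do {						\
		gettimeofday(&_tv2, NULL);				\
		long _us = (_tv2.tv_sec  - _tv1.tv_sec) * 1000000L +	\
			   (_tv2.tv_usec - _tv1.tv_usec);		\
		snprintf(_tv_str, sizeof(_tv_str), "usec=%ld", _us);	\
	} while (0)
#define TIME_STR	_tv_str

int main(void)
{
	DEF_TIMERS;
	START_TIMER;
	/* ... work being timed ... */
	END_TIMER;
	printf("call took: %s\n", TIME_STR);
	return 0;
}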
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUs the application is using.
	 * This information is then used to compact the memory.
	 *
	 * The information is found in the following locations.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA nodes: mems
	 * CPU masks:  cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		xfree(numa_nodes);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument, a path to the cpuset directory, must be
	 * NULL because the cpuset directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);

	// alpsc_compact_mem() returns 1 on success
	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
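_get_numa_nodes() and _get_cpu_masks() are not shown in this listing. Assuming the kernel's cpuset list format (e.g. "0-1,3") in the mems file named above, a minimal sketch of what a helper like _get_numa_nodes() might do (illustrative only; parse_cpuset_list is a hypothetical name, and the plugin's real helper differs in detail) is:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Sketch: expand a kernel cpuset list string (e.g. "0-1,3") into an
 * array of node IDs, as read from <cpuset path>/mems.
 */
static int parse_cpuset_list(const char *list, int *cnt, int32_t **out)
{
	int32_t *nodes = NULL;
	int n = 0;
	char *copy = strdup(list), *tok, *save = NULL;

	if (!copy)
		return -1;
	for (tok = strtok_r(copy, ",\n", &save); tok;
	     tok = strtok_r(NULL, ",\n", &save)) {
		char *dash = strchr(tok, '-');
		long lo = strtol(tok, NULL, 10);
		long hi = dash ? strtol(dash + 1, NULL, 10) : lo;

		for (long i = lo; i <= hi; i++) {
			int32_t *tmp = realloc(nodes,
					       (n + 1) * sizeof(*nodes));
			if (!tmp) {
				free(nodes);
				free(copy);
				return -1;
			}
			nodes = tmp;
			nodes[n++] = (int32_t)i;
		}
	}
	free(copy);
	*cnt = n;
	*out = nodes;
	return 0;
}

int main(void)
{
	int cnt, i;
	int32_t *numa_nodes;

	if (parse_cpuset_list("0-1,3", &cnt, &numa_nodes) == 0) {
		for (i = 0; i < cnt; i++)
			printf("node %d\n", numa_nodes[i]);
		free(numa_nodes);
	}
	return 0;
}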