/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUs the application was using;
	 * that information is then used to compact the memory.
	 *
	 * The information is found in the following locations.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA nodes: mems
	 * CPU masks:  cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run the epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument, a path to the cpuset directory, has to be
	 * NULL because the cpuset directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);
	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
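/*
 * Illustration only: the cpuset comment above says the step's NUMA
 * nodes are listed in the "mems" file of the step's cpuset directory.
 * Below is a minimal sketch of how a helper in the spirit of
 * _get_numa_nodes() might parse that file's range list (e.g. "0-1,3")
 * into the (cnt, numa_nodes) pair the caller expects. The name
 * parse_mems_file() is hypothetical and plain realloc() stands in for
 * Slurm's xmalloc() family; the plugin's real helper may differ.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_mems_file(const char *path, int *cnt, int32_t **nodes)
{
	char buf[256];
	FILE *fp = fopen(path, "r");	/* e.g. <cpuset dir>/mems */

	if (!fp || !fgets(buf, sizeof(buf), fp)) {
		if (fp)
			fclose(fp);
		return -1;
	}
	fclose(fp);

	*cnt = 0;
	*nodes = NULL;

	/* Walk comma-separated tokens; each token is "N" or "lo-hi" */
	for (char *tok = strtok(buf, ",\n"); tok;
	     tok = strtok(NULL, ",\n")) {
		char *dash = strchr(tok, '-');
		long lo = strtol(tok, NULL, 10);
		long hi = dash ? strtol(dash + 1, NULL, 10) : lo;

		for (long n = lo; n <= hi; n++) {
			int32_t *tmp = realloc(*nodes,
					       (*cnt + 1) * sizeof(int32_t));
			if (!tmp) {
				free(*nodes);
				return -1;
			}
			*nodes = tmp;
			tmp[(*cnt)++] = (int32_t) n;
		}
	}
	return 0;
}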
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Unlink the file
	errno = 0;
	rc = unlink(llifile);
	if (rc == -1 && errno != ENOENT) {
		error("%s: unlink(%s) failed: %m", __func__, llifile);
	} else if (rc == 0) {
		info("Unlinked %s", llifile);
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUs the application was using;
	 * that information is then used to compact the memory.
	 *
	 * The information is found in the following locations.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA nodes: mems
	 * CPU masks:  cpus
	 */
	if ((job->stepid == NO_VAL) || (job->stepid == SLURM_BATCH_SCRIPT)) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		error("(%s: %d: %s) get_numa_nodes failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		error("(%s: %d: %s) get_cpu_masks failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument, a path to the cpuset directory, has to be
	 * NULL because the cpuset directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);

	xfree(numa_nodes);
	xfree(cpuMasks);
	if (rc != 1) {
		if (err_msg) {
			error("(%s: %d: %s) alpsc_compact_mem failed: %s",
			      THIS_FILE, __LINE__, __FUNCTION__, err_msg);
			free(err_msg);
		} else {
			error("(%s: %d: %s) alpsc_compact_mem failed: "
			      "No error message present.",
			      THIS_FILE, __LINE__, __FUNCTION__);
		}
		return SLURM_ERROR;
	}
	if (err_msg) {
		info("(%s: %d: %s) alpsc_compact_mem: %s",
		     THIS_FILE, __LINE__, __FUNCTION__, err_msg);
		free(err_msg);
	}
#endif
	return SLURM_SUCCESS;
}
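/*
 * Illustration only: _get_cpu_masks() converts the NUMA node list into
 * one cpu_set_t per node. A minimal sketch of that conversion follows,
 * assuming the CPUs of each node are read from the kernel's
 * /sys/devices/system/node/node<N>/cpulist range lists. The name
 * build_cpu_masks() is hypothetical and plain calloc() stands in for
 * Slurm's allocators; the plugin's real helper may obtain the masks
 * differently (e.g. from the step's cpuset "cpus" file).
 */
#define _GNU_SOURCE
#include <inttypes.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int build_cpu_masks(int cnt, const int32_t *numa_nodes,
			   cpu_set_t **masks)
{
	*masks = calloc(cnt, sizeof(cpu_set_t));
	if (!*masks)
		return -1;

	for (int i = 0; i < cnt; i++) {
		char path[128], buf[256];
		FILE *fp;

		/* e.g. /sys/devices/system/node/node0/cpulist -> "0-7" */
		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%" PRId32 "/cpulist",
			 numa_nodes[i]);
		fp = fopen(path, "r");
		if (!fp || !fgets(buf, sizeof(buf), fp)) {
			if (fp)
				fclose(fp);
			free(*masks);
			return -1;
		}
		fclose(fp);

		CPU_ZERO(&(*masks)[i]);
		/* Expand each "N" or "lo-hi" token into the mask */
		for (char *tok = strtok(buf, ",\n"); tok;
		     tok = strtok(NULL, ",\n")) {
			char *dash = strchr(tok, '-');
			long lo = strtol(tok, NULL, 10);
			long hi = dash ? strtol(dash + 1, NULL, 10) : lo;

			for (long c = lo; c <= hi; c++)
				CPU_SET(c, &(*masks)[i]);
		}
	}
	return 0;
}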