/*
 * Set the count of specialized cores at job start
 *
 * Return SLURM_SUCCESS on success
 */
extern int core_spec_p_set(uint64_t cont_id, uint16_t core_count)
{
#if _DEBUG
	info("core_spec_p_set(%"PRIu64") to %u", cont_id, core_count);
#endif
#ifdef HAVE_NATIVE_CRAY
	int rc;
	struct job_set_affinity_info affinity_info;
	pid_t pid;
	int i;

	// Skip core spec setup for no specialized cores
	if ((core_count == (uint16_t) NO_VAL) || (core_count < 1)) {
		return SLURM_SUCCESS;
	}

	// Set the core spec information
	// Retry because there's a small timing window during preemption
	// when two core spec jobs can be running at once.
	for (i = 0; i < CORE_SPEC_RETRIES; i++) {
		if (i) {
			sleep(1);
		}

		errno = 0;
		rc = job_set_corespec(cont_id, core_count, NULL);
		if (rc == 0 || errno != EINVAL) {
			break;
		}
	}
	if (rc != 0) {
		error("job_set_corespec(%"PRIu64", %"PRIu16") failed: %m",
		      cont_id, core_count);
		return SLURM_ERROR;
	}

	pid = getpid();

	// Slurm detaches the slurmstepd from the job, so we temporarily
	// reattach so the job_set_affinity doesn't mess up one of the
	// task's affinity settings
	if (job_attachpid(pid, cont_id) == (jid_t) -1) {
		error("job_attachpid(%zu, %"PRIu64") failed: %m",
		      (size_t)pid, cont_id);
		return SLURM_ERROR;
	}

	// Apply the core specialization with job_set_affinity
	// Use NONE for the cpu list because Slurm handles its
	// own task->cpu binding
	memset(&affinity_info, 0, sizeof(struct job_set_affinity_info));
	affinity_info.cpu_list = JOB_AFFINITY_NONE;
	rc = job_set_affinity(cont_id, pid, &affinity_info);
	if (rc != 0) {
		if (affinity_info.message != NULL) {
			error("job_set_affinity(%"PRIu64", %zu) failed %s: %m",
			      cont_id, (size_t)pid, affinity_info.message);
			free(affinity_info.message);
		} else {
			error("job_set_affinity(%"PRIu64", %zu) failed: %m",
			      cont_id, (size_t)pid);
		}
		job_detachpid(pid);
		return SLURM_ERROR;
	} else if (affinity_info.message != NULL) {
		info("job_set_affinity(%"PRIu64", %zu): %s",
		     cont_id, (size_t)pid, affinity_info.message);
		free(affinity_info.message);
	}
	job_detachpid(pid);
#endif
	// The code that was here is now performed by
	// switch_p_job_step_{pre,post}_suspend()
	return SLURM_SUCCESS;
}
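
/*
 * Illustrative sketch only, not part of the plugin: the
 * attach -> operate -> detach bracket used by core_spec_p_set() above.
 * slurmstepd normally runs detached from the job container, so it must
 * reattach before calling job_set_affinity() and detach again on every
 * exit path, including errors.  do_affinity_work() is a hypothetical
 * placeholder; the block is compiled out.
 */
#if 0
static int with_job_attached(uint64_t cont_id)
{
	pid_t pid = getpid();
	int rc;

	if (job_attachpid(pid, cont_id) == (jid_t) -1)
		return SLURM_ERROR;

	rc = do_affinity_work(cont_id, pid);	/* hypothetical helper */

	/* Always detach, even when the work above failed */
	job_detachpid(pid);

	return (rc == 0) ? SLURM_SUCCESS : SLURM_ERROR;
}
#endif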
/* NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time.
 */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	int count = 0;

	DEF_TIMERS;
	START_TIMER;

try_again:
	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		if (errno == EINVAL && (count < 1)) {
			jid_t jid;
			if (proctrack_p_has_pid(job->cont_id, pid)) {
				debug("%s: Trying to add pid (%d) again to the same container, ignoring.",
				      __func__, pid);
				return SLURM_SUCCESS;
			}

			if ((jid = job_detachpid(pid)) != (jid_t) -1) {
				error("%s: Pid %d was attached to container %"PRIu64" incorrectly. Moving to correct (%"PRIu64").",
				      __func__, pid, jid, job->cont_id);
				count++;
				goto try_again;
			} else {
				error("%s: Couldn't detach pid %d from container: %m",
				      __func__, pid);
				return SLURM_ERROR;
			}
		} else {
			error("Failed to attach pid %d to job container: %m",
			      pid);
			return SLURM_ERROR;
		}
	}
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		error("Failed to write to %s: %m", fname);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
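
/*
 * Illustrative sketch only, not part of the plugin: the /proc flag-write
 * pattern proctrack_p_add() uses to mark a pid as an application.  The
 * single-byte write and TEMP_FAILURE_RETRY() around close() mirror the
 * code above; the helper name is hypothetical and the block is compiled
 * out.
 */
#if 0
static int mark_task_is_app(pid_t pid)
{
	char fname[64];
	int fd;

	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1)
		return SLURM_ERROR;

	if (write(fd, "1", 1) < 1) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));

	return SLURM_SUCCESS;
}
#endif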