/*
 * proctrack_p_add - attach a pid to the step's job container.
 *
 * IN job - step record; job->cont_id identifies the container to attach to
 *          (on native Cray, job->jobid and job->stepid also form the apid)
 * IN pid - process to attach
 * RET SLURM_SUCCESS on success, SLURM_ERROR if any step fails (the failure
 *     is logged with error() before returning)
 *
 * NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time.
 *
 * NOTE(review): the body below explicitly attaches pid with job_attachpid()
 * rather than removing slurmstepd; confirm the note above is still accurate.
 */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	DEF_TIMERS;

	START_TIMER;

	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		error("Failed to attach pid %d to job container: %m", pid);
		return SLURM_ERROR;
	}

	/*
	 * Helper defined elsewhere in this file; presumably stops the thread
	 * that was keeping the container alive -- TODO confirm.
	 */
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		/* Log first so %m still reflects the write() failure. */
		error("Failed to write to %s: %m", fname);
		/*
		 * close() is deliberately called exactly once: on Linux the
		 * descriptor is released even when close() fails with EINTR,
		 * so retrying could close a descriptor that another thread
		 * has opened in the meantime.
		 */
		close(fd);
		return SLURM_ERROR;
	}
	/* Single close(), never retried on EINTR (see above). */
	close(fd);
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
/*
 * proctrack_p_add - attach a pid to the step's job container.
 *
 * If the attach fails with EINVAL, the function first checks whether the pid
 * is already in this container (treated as success), and otherwise detaches
 * the pid from whatever container it is in and retries the attach once.
 *
 * IN job - step record; job->cont_id identifies the container to attach to
 *          (on native Cray, job->jobid and job->stepid also form the apid)
 * IN pid - process to attach
 * RET SLURM_SUCCESS on success (including "pid already in this container"),
 *     SLURM_ERROR if any step fails (the failure is logged with error())
 *
 * NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time.
 *
 * NOTE(review): the body below explicitly attaches pid with job_attachpid()
 * rather than removing slurmstepd; confirm the note above is still accurate.
 */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	int count = 0;	/* number of detach-and-retry attempts made so far */
	DEF_TIMERS;

	START_TIMER;

try_again:
	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		/* Only the first EINVAL failure gets the recovery path. */
		if (errno == EINVAL && (count < 1)) {
			jid_t jid;

			if (proctrack_p_has_pid(job->cont_id, pid)) {
				/*
				 * Returns immediately: the steps after the
				 * attach are skipped for an already-tracked
				 * pid.
				 */
				debug("%s: Trying to add pid (%d) again to the same container, ignoring.",
				      __func__, pid);
				return SLURM_SUCCESS;
			}

			if ((jid = job_detachpid(pid)) != (jid_t) -1) {
				error("%s: Pid %d was attached to container %"PRIu64" incorrectly. Moving to correct (%"PRIu64").",
				      __func__, pid, jid, job->cont_id);
				count++;
				goto try_again;
			} else {
				error("%s: Couldn't detach pid %d from container: %m",
				      __func__, pid);
				return SLURM_ERROR;
			}
		} else {
			error("Failed to attach pid %d to job container: %m",
			      pid);
			return SLURM_ERROR;
		}
	}

	/*
	 * Helper defined elsewhere in this file; presumably stops the thread
	 * that was keeping the container alive -- TODO confirm.
	 */
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		/* Log first so %m still reflects the write() failure. */
		error("Failed to write to %s: %m", fname);
		/*
		 * close() is deliberately called exactly once: on Linux the
		 * descriptor is released even when close() fails with EINTR,
		 * so retrying could close a descriptor that another thread
		 * has opened in the meantime.
		 */
		close(fd);
		return SLURM_ERROR;
	}
	/* Single close(), never retried on EINTR (see above). */
	close(fd);
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}