/*
 * Handle a "signal container" request arriving on fd.
 *
 * Reads the signal number from fd, verifies that the requester is either
 * the job owner or an authorized user, logs (at most once per process, and
 * only on node 0 while the step has not reached the ENDING state) a
 * cancellation/failure banner, and then delivers the signal to the step's
 * container.
 *
 * IN fd  - descriptor the request is read from and the reply is written to
 * IN job - step record (ids, owner uid, container id, task table)
 * IN uid - uid of the requester
 *
 * The reply written to fd is two ints: rc (SLURM_SUCCESS or -1) followed by
 * errnum (0, EPERM, ESLURMD_JOB_NOTRUNNING, ESLURMD_STEP_SUSPENDED, or the
 * errno left by slurm_container_signal()).
 *
 * RET SLURM_SUCCESS once the reply has been written, even when the request
 *     itself was refused; SLURM_FAILURE via the rwfail label (presumably the
 *     target of the safe_read()/safe_write() macros on I/O error).
 */
static int
_handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	static int msg_sent = 0;	/* banner is logged only once */

	debug("_handle_signal_container for job %u.%u",
	      job->jobid, job->stepid);

	safe_read(fd, &sig, sizeof(int));

	/* uid_t need not fit in an int: print it as a long, the same way
	 * the permission message below does. */
	debug3(" uid = %ld", (long)uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("kill container req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	if ((job->nodeid == 0) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];

		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}

	/* These pseudo-signals only produce the banner above; nothing is
	 * delivered to the container for them. */
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED)  || (sig == SIG_FAILURE))
		goto done;

	if (sig == SIG_DEBUG_WAKE) {
		int i;

		for (i = 0; i < job->node_tasks; i++) {
			/* Skip empty task slots instead of dereferencing
			 * a NULL task pointer. */
			if (job->task[i] == NULL)
				continue;
			pdebug_wake_process(job, job->task[i]->pid);
		}
		goto done;
	}

	/* An abort request is delivered as SIGKILL and recorded on the job */
	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	/* Hold suspend_mutex across the suspended test and the delivery;
	 * a suspended step only accepts SIGKILL. */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
	} else if (slurm_container_signal(job->cont_id, sig) < 0) {
		/*
		 * Signal the container failed: capture errno before logging
		 */
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a "signal container" request arriving on fd.
 *
 * Reads the signal number from fd, verifies that the requester is either
 * the step owner or an authorized user, marks still-running tasks as
 * killed_by_cmd for SIGTERM/SIGKILL, logs (at most once per process) a
 * cancellation/failure banner on the node selected by the step's
 * SLURM_STEP_KILLED_MSG_NODE_ID environment variable (node 0 when unset),
 * and then delivers the signal to the step's container.
 *
 * IN fd  - descriptor the request is read from and the reply is written to
 * IN job - step record (ids, owner uid, container id, env, task table)
 * IN uid - uid of the requester
 *
 * The reply written to fd is two ints: rc (SLURM_SUCCESS or -1) followed by
 * errnum (0, EPERM, ESLURMD_JOB_NOTRUNNING, ESLURMD_STEP_SUSPENDED, or the
 * errno left by proctrack_g_signal()).
 *
 * RET SLURM_SUCCESS once the reply has been written, even when the request
 *     itself was refused; SLURM_FAILURE via the rwfail label (presumably the
 *     target of the safe_read()/safe_write() macros on I/O error).
 */
static int
_handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	static int msg_sent = 0;	/* banner is logged only once */
	char *ptr = NULL;
	int target_node_id = 0;
	stepd_step_task_info_t *task;
	uint32_t i;

	safe_read(fd, &sig, sizeof(int));

	/* uid_t need not fit in an int: print it as a long, the same way
	 * the permission message below does. */
	debug("_handle_signal_container for step=%u.%u uid=%ld signal=%d",
	      job->jobid, job->stepid, (long) uid, sig);
	if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
		error("signal container req from uid %ld for step=%u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	if ((sig == SIGTERM) || (sig == SIGKILL)) {
		/* cycle thru the tasks and mark those that have not
		 * called abort and/or terminated as killed_by_cmd
		 */
		for (i = 0; i < job->node_tasks; i++) {
			task = job->task[i];
			if (task == NULL)
				continue;
			if (task->aborted || task->exited)
				continue;
			/* mark that this task is going to be killed by
			 * cmd so we ignore its exit status - otherwise,
			 * we will probably report the final exit status
			 * as SIGKILL
			 */
			task->killed_by_cmd = true;
		}
	}

	/* The step's environment may name the node that prints the banner */
	ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
	if (ptr)
		target_node_id = atoi(ptr);

	if ((job->nodeid == target_node_id) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];

		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}

	/* These pseudo-signals only produce the banner above; nothing is
	 * delivered to the container for them. */
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED)  || (sig == SIG_FAILURE))
		goto done;

	/* An abort request is delivered as SIGKILL and recorded on the job */
	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	/* Hold suspend_mutex across the suspended test and whatever action
	 * follows; a suspended step only accepts SIGKILL. */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
	} else if (sig == SIG_DEBUG_WAKE) {
		/* Reuse the function-level index (no shadowing local) and
		 * skip empty task slots, as the killed_by_cmd loop does. */
		for (i = 0; i < job->node_tasks; i++) {
			task = job->task[i];
			if (task == NULL)
				continue;
			pdebug_wake_process(job, task->pid);
		}
	} else if (proctrack_g_signal(job->cont_id, sig) < 0) {
		/*
		 * Signal the container failed: capture errno before logging
		 */
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}