/*
 * Handle a reconfigure request on the stepd control socket.
 * Re-opens the daemon log (the log file may have been rotated, leaving
 * the current fd pointing at the old file).  Restricted to root/SlurmUser.
 * Replies with an int return code followed by an int errno value.
 */
static int _handle_reconfig(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;

	if (_slurm_authorized_user(uid)) {
		/* Re-attach logging to the (possibly rotated) log file. */
		log_alter(conf->log_opts, SYSLOG_FACILITY_DAEMON,
			  conf->logfile);
		debug("_handle_reconfigure for job %u.%u successful",
		      job->jobid, job->stepid);
	} else {
		debug("job step reconfigure request from uid %ld "
		      "for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
	}

	/* Send the return code and errno */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));

	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a "notify job" request on the stepd control socket.
 * Wire format read from fd: int len, then len bytes of '\0'-terminated
 * message text.  The message is echoed into the step's log via error()
 * so it is displayed by default.  Replies with a single int return code.
 * Permitted only for the job owner or root/SlurmUser.
 */
static int _handle_notify_job(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int len;
	char *message = NULL;

	debug3("_handle_notify_job for job %u.%u", job->jobid, job->stepid);

	safe_read(fd, &len, sizeof(int));
	if (len) {
		message = xmalloc (len);
		safe_read(fd, message, len); /* '\0' terminated */
	}

	debug3(" uid = %d", uid);
	if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
		debug("notify req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = EPERM;	/* sent back as the return code */
		goto done;
	}

	/* Deliver the notification text into the step output */
	error("%s", message);

done:
	/* Send the return code */
	safe_write(fd, &rc, sizeof(int));
	xfree(message);		/* xfree(NULL) is a no-op; also clears ptr */
	return SLURM_SUCCESS;
rwfail:
	/* FIX: the message buffer was previously leaked when a socket
	 * read/write failed after allocation. */
	xfree(message);
	return SLURM_FAILURE;
}
/*
 * Handle a terminate request: SIGKILL the step's process container.
 * Replies with an int return code followed by an int errno value.
 * Permitted only for the job owner or root/SlurmUser.
 */
static int _handle_terminate(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;

	debug("_handle_terminate for job %u.%u", job->jobid, job->stepid);
	/* NOTE(review): the terminate monitor is started before the
	 * permission check below, so even a rejected request arms it —
	 * confirm this ordering is intentional. */
	step_terminate_monitor_start(job->jobid, job->stepid);

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("terminate req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks: a zero cont_id means no container was ever
	 * created, so there is nothing to signal.
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/*
	 * Signal the container with SIGKILL.  Hold suspend_mutex so the
	 * suspended flag cannot change while we act on it.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		debug("Terminating suspended job step %u.%u",
		      job->jobid, job->stepid);
	}

	if (slurm_container_signal(job->cont_id, SIGKILL) < 0) {
		rc = -1;
		errnum = errno;	/* propagate the signal failure to caller */
		verbose("Error sending SIGKILL signal to %u.%u: %m",
			job->jobid, job->stepid);
	} else {
		verbose("Sent SIGKILL signal to %u.%u",
			job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a resume request: send SIGCONT to a previously suspended step
 * and restart accounting polling.  Restricted to root/SlurmUser.
 * Replies with an int return code followed by an int errno value.
 */
static int _handle_resume(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;

	debug("_handle_resume for job %u.%u", job->jobid, job->stepid);
	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("job step resume request from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}
	/* No container => nothing running to resume */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* Restart accounting sampling before the tasks resume running */
	acct_gather_resume_poll();
	/*
	 * Signal the container; suspend_mutex guards the suspended flag.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (!suspended) {
		rc = -1;
		errnum = ESLURMD_STEP_NOTSUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	} else {
		/* NOTE(review): a SIGCONT delivery failure is only logged;
		 * rc/errnum stay SUCCESS and suspended is cleared anyway —
		 * confirm that is the intended contract. */
		if (proctrack_g_signal(job->cont_id, SIGCONT) < 0) {
			verbose("Error resuming %u.%u: %m",
				job->jobid, job->stepid);
		} else {
			verbose("Resumed %u.%u", job->jobid, job->stepid);
		}
		suspended = false;
	}
	/* set the cpu frequencies if cpu_freq option used */
	if (job->cpu_freq != NO_VAL)
		cpu_freq_set(job);
	pthread_mutex_unlock(&suspend_mutex);
done:
	/* Send the return code and errno */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a jobacct statistics request: sample each local task, aggregate
 * the results, and ship the aggregate plus the sampled-task count back
 * over the pipe.  Permitted for the job owner or root/SlurmUser.
 */
static int _handle_stat_jobacct(int fd, stepd_step_rec_t *job, uid_t uid)
{
	jobacctinfo_t *jobacct = NULL;
	jobacctinfo_t *temp_jobacct = NULL;
	int i = 0;
	int num_tasks = 0;
	debug("_handle_stat_jobacct for job %u.%u", job->jobid, job->stepid);

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("stat jobacct from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		/* Send NULL: jobacct is still NULL here, which setinfo
		 * encodes as an empty record for the peer to detect. */
		jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
				    SLURM_PROTOCOL_VERSION);
		return SLURM_ERROR;
	}

	jobacct = jobacctinfo_create(NULL);
	debug3("num tasks = %d", job->node_tasks);

	/* Poll every local task; tasks that cannot be sampled (e.g.
	 * already exited) simply don't contribute to the aggregate. */
	for (i = 0; i < job->node_tasks; i++) {
		temp_jobacct = jobacct_gather_stat_task(job->task[i]->pid);
		if (temp_jobacct) {
			jobacctinfo_aggregate(jobacct, temp_jobacct);
			jobacctinfo_destroy(temp_jobacct);
			num_tasks++;
		}
	}

	/* Send the aggregate record, then the count of tasks sampled */
	jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
			    SLURM_PROTOCOL_VERSION);
	safe_write(fd, &num_tasks, sizeof(int));
	jobacctinfo_destroy(jobacct);
	return SLURM_SUCCESS;
rwfail:
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
/*
 * Should be called when a connect() to a socket returns ECONNREFUSED.
 * Presumably the ECONNREFUSED means that nothing is attached to the listening
 * side of the unix domain socket.
 * If the socket is at least five minutes old, go ahead an unlink it.
 */
static void _handle_stray_socket(const char *socket_name)
{
	struct stat buf;
	uid_t uid;
	time_t now;

	/* Only attempt to remove the stale socket if process is running
	   as root or the SlurmUser.
	   NOTE(review): called with no uid argument here, unlike every
	   other _slurm_authorized_user(uid) call site — confirm this
	   matches the function's declared signature. */
	if (!_slurm_authorized_user())
		return;

	if (stat(socket_name, &buf) == -1) {
		debug3("_handle_stray_socket: unable to stat %s: %m",
		       socket_name);
		return;
	}

	/* Only clean up sockets we ourselves own */
	if ((uid = getuid()) != buf.st_uid) {
		debug3("_handle_stray_socket: socket %s is not owned by uid %d",
		       socket_name, (int)uid);
		return;
	}

	now = time(NULL);
	/* 300s == five minutes: old enough that no live stepd owns it */
	if ((now-buf.st_mtime) > 300) {
		/* remove the socket */
		if (unlink(socket_name) == -1) {
			if (errno != ENOENT) {
				error("_handle_stray_socket: unable to clean up"
				      " stray socket %s: %m", socket_name);
			}
		} else {
			debug("Cleaned up stray socket %s", socket_name);
		}
	}
}
/*
 * Handle a suspend request: SIGTSTP then SIGSTOP the step's container.
 * Restricted to root/SlurmUser.  Replies with an int return code
 * followed by an int errno value.
 * NOTE(review): another _handle_suspend definition with a 2-second delay
 * appears later in this file — these look like two revisions of the same
 * function; confirm which one belongs in the build.
 */
static int _handle_suspend(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;

	debug("_handle_suspend for job %u.%u", job->jobid, job->stepid);
	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("job step suspend request from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}
	/* No container => nothing running to suspend */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* Pause accounting sampling while the step is stopped */
	jobacct_gather_g_suspend_poll();

	/*
	 * Signal the container; suspend_mutex guards the suspended flag.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	} else {
		/* SIGTSTP is sent first to let MPI daemons stop their
		 * tasks, then we send SIGSTOP to stop everything else */
		if (slurm_container_signal(job->cont_id, SIGTSTP) < 0) {
			verbose("Error suspending %u.%u (SIGTSTP): %m",
				job->jobid, job->stepid);
		} else
			sleep(1);	/* give MPI daemons time to react */

		if (slurm_container_signal(job->cont_id, SIGSTOP) < 0) {
			verbose("Error suspending %u.%u (SIGSTOP): %m",
				job->jobid, job->stepid);
		} else {
			verbose("Suspended %u.%u", job->jobid, job->stepid);
		}
		suspended = true;
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errno */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a suspend request: SIGTSTP then SIGSTOP the step's container.
 * Restricted to root/SlurmUser.  Replies with an int return code
 * followed by an int errno value.
 * NOTE(review): a near-identical _handle_suspend (1-second delay)
 * appears earlier in this file — confirm which revision is live.
 */
static int _handle_suspend(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;

	debug("_handle_suspend for job %u.%u", job->jobid, job->stepid);
	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("job step suspend request from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}
	/* No container => nothing running to suspend */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* Pause accounting sampling while the step is stopped */
	jobacct_gather_g_suspend_poll();

	/*
	 * Signal the container; suspend_mutex guards the suspended flag.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	} else {
		/* SIGTSTP is sent first to let MPI daemons stop their tasks,
		 * then wait 2 seconds, then send SIGSTOP to the spawned
		 * process's container to stop everything else.
		 *
		 * In some cases, 1 second has proven insufficient. Longer
		 * delays may help insure that all MPI tasks have been stopped
		 * (that depends upon the MPI implementaiton used), but will
		 * also permit longer time periods when more than one job can
		 * be running on each resource (not good). */
		if (slurm_container_signal(job->cont_id, SIGTSTP) < 0) {
			verbose("Error suspending %u.%u (SIGTSTP): %m",
				job->jobid, job->stepid);
		} else
			sleep(2);

		if (slurm_container_signal(job->cont_id, SIGSTOP) < 0) {
			verbose("Error suspending %u.%u (SIGSTOP): %m",
				job->jobid, job->stepid);
		} else {
			verbose("Suspended %u.%u", job->jobid, job->stepid);
		}
		suspended = true;
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errno */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a checkpoint request for the step's tasks.
 * Wire format read from fd: time_t timestamp, int len, then len bytes of
 * '\0'-terminated image directory path.  Replies with a single int
 * return code.  Permitted for the job owner or root/SlurmUser.
 */
static int _handle_checkpoint_tasks(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	time_t timestamp;
	int len;
	char *image_dir = NULL;

	debug3("_handle_checkpoint_tasks for job %u.%u",
	       job->jobid, job->stepid);

	/* FIX: "&timestamp" had been mangled into an HTML entity
	 * ("×tamp"), which did not compile. */
	safe_read(fd, &timestamp, sizeof(time_t));
	safe_read(fd, &len, sizeof(int));
	if (len) {
		image_dir = xmalloc (len);
		safe_read(fd, image_dir, len); /* '\0' terminated */
	}

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("checkpoint req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = EPERM;
		goto done;
	}

	/* The timestamp de-duplicates retransmitted checkpoint requests */
	if (job->ckpt_timestamp &&
	    timestamp == job->ckpt_timestamp) {
		debug("duplicate checkpoint req for job %u.%u, "
		      "timestamp %ld. discarded.",
		      job->jobid, job->stepid, (long)timestamp);
		rc = ESLURM_ALREADY_DONE; /* EINPROGRESS? */
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->pgid <= (pid_t)1) {
		debug ("step %u.%u invalid [jmgr_pid:%d pgid:%u]",
		       job->jobid, job->stepid, job->jmgr_pid, job->pgid);
		rc = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/*
	 * Signal the process group; suspend_mutex guards the suspended flag
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		rc = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/* set timestamp in case another request comes */
	job->ckpt_timestamp = timestamp;

	/* TODO: do we need job->ckpt_dir any more,
	 *	except for checkpoint/xlch? */
	/* if (! image_dir) { */
	/*	image_dir = xstrdup(job->ckpt_dir); */
	/* } */

	/* call the plugin to send the request.
	 * FIX: capture the plugin's return code before logging; the old
	 * code set rc = -1 first and then logged slurm_strerror(rc),
	 * discarding the plugin's actual error. */
	rc = checkpoint_signal_tasks(job, image_dir);
	if (rc != SLURM_SUCCESS) {
		verbose("Error sending checkpoint request to %u.%u: %s",
			job->jobid, job->stepid, slurm_strerror(rc));
		rc = -1;	/* preserve the -1 wire return code */
	} else {
		verbose("Sent checkpoint request to %u.%u",
			job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code */
	safe_write(fd, &rc, sizeof(int));
	xfree(image_dir);
	return SLURM_SUCCESS;
rwfail:
	/* FIX: image_dir was previously leaked on read/write failure */
	xfree(image_dir);
	return SLURM_FAILURE;
}
/*
 * Handle a signal-container request.
 * Wire format read from fd: int sig.  Replies with an int return code
 * followed by an int errno value.  Besides real signals, sig may be one
 * of the SIG_* pseudo-signals (time limit, preemption, node failure,
 * failure, debug wake, abort).  Permitted for the job owner or
 * root/SlurmUser.
 */
static int _handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	/* emit the CANCELLED banner at most once per stepd lifetime */
	static int msg_sent = 0;

	debug("_handle_signal_container for job %u.%u",
	      job->jobid, job->stepid);
	safe_read(fd, &sig, sizeof(int));

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("kill container req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* Only node 0 prints the banner, so it appears once per step */
	if ((job->nodeid == 0) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];
		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}

	/* Pure-notification pseudo-signals carry no signal to deliver */
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED) || (sig == SIG_FAILURE))
		goto done;

	/* Debug wake pokes each task individually instead of the container */
	if (sig == SIG_DEBUG_WAKE) {
		int i;
		for (i = 0; i < job->node_tasks; i++)
			pdebug_wake_process(job, job->task[i]->pid);
		goto done;
	}

	/* SIG_ABORT maps to SIGKILL and flags the step as aborted */
	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	pthread_mutex_lock(&suspend_mutex);
	/* Only SIGKILL may be delivered to a suspended step */
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/*
	 * Signal the container
	 */
	if (slurm_container_signal(job->cont_id, sig) < 0) {
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a signal-one-task request.
 * Wire format read from fd: int signal, int ltaskid (local task index).
 * Replies with a single int return code.  Permitted for the job owner
 * or root/SlurmUser.
 */
static int _handle_signal_task_local(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int signal;
	int ltaskid; /* local task index */

	debug("_handle_signal_task_local for job %u.%u",
	      job->jobid, job->stepid);

	safe_read(fd, &signal, sizeof(int));
	/* FIX: "&ltaskid" had been mangled into an HTML entity
	 * ("<askid"), which did not compile. */
	safe_read(fd, &ltaskid, sizeof(int));

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("kill req from uid %ld for job %u.%u owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = EPERM;
		goto done;
	}

	/*
	 * Sanity checks: index in range, task record present, pid real
	 */
	if (ltaskid < 0 || ltaskid >= job->node_tasks) {
		debug("step %u.%u invalid local task id %d",
		      job->jobid, job->stepid, ltaskid);
		rc = SLURM_ERROR;
		goto done;
	}
	if (!job->task || !job->task[ltaskid]) {
		debug("step %u.%u no task info for task id %d",
		      job->jobid, job->stepid, ltaskid);
		rc = SLURM_ERROR;
		goto done;
	}
	if (job->task[ltaskid]->pid <= 1) {
		debug("step %u.%u invalid pid %d for task %d",
		      job->jobid, job->stepid,
		      job->task[ltaskid]->pid, ltaskid);
		rc = SLURM_ERROR;
		goto done;
	}

	/*
	 * Signal the task; suspend_mutex guards the suspended flag
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		rc = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	if (kill(job->task[ltaskid]->pid, signal) == -1) {
		rc = -1;
		verbose("Error sending signal %d to %u.%u, pid %d: %m",
			signal, job->jobid, job->stepid,
			job->task[ltaskid]->pid);
	} else {
		verbose("Sent signal %d to %u.%u, pid %d",
			signal, job->jobid, job->stepid,
			job->task[ltaskid]->pid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code */
	safe_write(fd, &rc, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a signal-process-group request: deliver a signal to the step's
 * process group via killpg().  Wire format read from fd: int signal.
 * Replies with a single int return code.  Permitted for the job owner
 * or root/SlurmUser.
 */
static int _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int signal;

	debug3("_handle_signal_process_group for job %u.%u",
	       job->jobid, job->stepid);

	safe_read(fd, &signal, sizeof(int));

	debug3(" uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("kill req from uid %ld for job %u.%u owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = EPERM;
		goto done;
	}

	/*
	 * Sanity checks: pgid 0/1 would target the wrong processes
	 */
	if (job->pgid <= (pid_t)1) {
		debug ("step %u.%u invalid [jmgr_pid:%d pgid:%u]",
		       job->jobid, job->stepid, job->jmgr_pid, job->pgid);
		rc = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/*
	 * Signal the process group; only SIGKILL is allowed while suspended
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended && (signal != SIGKILL)) {
		rc = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/*
	 * Print a message in the step output before killing when
	 * SIGTERM or SIGKILL are sent
	 */
	if ((signal == SIGTERM) || (signal == SIGKILL)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];
		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));
		error("*** %s KILLED AT %s WITH SIGNAL %u ***",
		      entity, time_str, signal);
	}

	if (killpg(job->pgid, signal) == -1) {
		rc = -1;
		verbose("Error sending signal %d to %u.%u, pgid %d: %m",
			signal, job->jobid, job->stepid, job->pgid);
	} else {
		verbose("Sent signal %d to %u.%u, pgid %d",
			signal, job->jobid, job->stepid, job->pgid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code */
	safe_write(fd, &rc, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a step-completion message from a child node in the fan-in tree.
 * Wire format read from fd: int first, int last (node-range completed),
 * int step_rc, then a jobacct record over the pipe.  Marks the range in
 * step_complete.bits, merges the accounting data, and replies with an
 * int return code plus an int errno — inside the locked region so the
 * stepd cannot exit before the reply is sent.  Restricted to
 * root/SlurmUser.
 */
static int _handle_completion(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	bool lock_set = false;	/* true while step_complete.lock is held */

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);
	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));
	jobacct = jobacct_gather_g_create(NULL);
	jobacct_gather_g_getinfo(jobacct, JOBACCT_DATA_PIPE, &fd);

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 * without the hostlist from the credential. */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" before bits: %s", bits_string);
#endif
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacct_gather_g_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacct_gather_g_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;
rwfail:
	/* FIX: the safe_write calls above run with step_complete.lock
	 * held; on write failure the lock was previously never released
	 * and the condvar never signaled, wedging the stepd.  Mirror the
	 * cleanup used by the newer _handle_completion in this file. */
	if (lock_set) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
/*
 * Handle an attach request: register a new srun with the step and
 * connect its I/O.  Wire format read from fd: two slurm_addr_t (I/O and
 * response addresses) and an I/O key.  Replies with an int return code;
 * on success also sends task count, pid array, gtid array, and each
 * task's argv[0].  Restricted to root/SlurmUser (only slurmd makes this
 * call).
 */
static int _handle_attach(int fd, slurmd_job_t *job, uid_t uid)
{
	srun_info_t *srun;
	int rc = SLURM_SUCCESS;
	bool srun_in_list = false;  /* true once job->sruns owns srun */
	uint32_t *pids = NULL, *gtids = NULL;

	debug("_handle_attach for job %u.%u", job->jobid, job->stepid);

	srun = xmalloc(sizeof(srun_info_t));
	srun->key = (srun_key_t *)xmalloc(SLURM_IO_KEY_SIZE);

	debug("sizeof(srun_info_t) = %d, sizeof(slurm_addr_t) = %d",
	      (int) sizeof(srun_info_t), (int) sizeof(slurm_addr_t));
	safe_read(fd, &srun->ioaddr, sizeof(slurm_addr_t));
	safe_read(fd, &srun->resp_addr, sizeof(slurm_addr_t));
	safe_read(fd, srun->key, SLURM_IO_KEY_SIZE);

	/*
	 * Check if jobstep is actually running.
	 */
	if (job->state != SLURMSTEPD_STEP_RUNNING) {
		rc = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/*
	 * At the moment, it only makes sense for the slurmd to make this
	 * call, so only _slurm_authorized_user is allowed.
	 */
	if (!_slurm_authorized_user(uid)) {
		error("uid %ld attempt to attach to job %u.%u owned by %ld",
		      (long) uid, job->jobid, job->stepid, (long)job->uid);
		rc = EPERM;
		goto done;
	}

	list_prepend(job->sruns, (void *) srun);
	srun_in_list = true;	/* ownership transferred to the list */
	rc = io_client_connect(srun, job);
	debug(" back from io_client_connect, rc = %d", rc);
done:
	/* Send the return code */
	safe_write(fd, &rc, sizeof(int));

	debug(" in _handle_attach rc = %d", rc);
	if (rc == SLURM_SUCCESS) {
		/* Send response info */
		int len, i;

		debug(" in _handle_attach sending response info");
		len = job->node_tasks * sizeof(uint32_t);
		pids = xmalloc(len);
		gtids = xmalloc(len);

		if (job->task != NULL) {
			for (i = 0; i < job->node_tasks; i++) {
				if (job->task[i] == NULL)
					continue;
				pids[i] = (uint32_t)job->task[i]->pid;
				gtids[i] = job->task[i]->gtid;
			}
		}

		safe_write(fd, &job->node_tasks, sizeof(uint32_t));
		safe_write(fd, pids, len);
		safe_write(fd, gtids, len);
		xfree(pids);
		xfree(gtids);

		/* NOTE(review): unlike the loop above, this loop does not
		 * guard against a NULL job->task[i] — skipping one would
		 * change the wire protocol, so only flagging it here. */
		for (i = 0; i < job->node_tasks; i++) {
			len = strlen(job->task[i]->argv[0]) + 1;
			safe_write(fd, &len, sizeof(int));
			safe_write(fd, job->task[i]->argv[0], len);
		}
	}
	/* FIX: srun (and its key) was previously leaked whenever the
	 * request was rejected before list_prepend(). */
	if (!srun_in_list) {
		xfree(srun->key);
		xfree(srun);
	}
	return SLURM_SUCCESS;
rwfail:
	/* FIX: release buffers leaked on read/write failure */
	xfree(pids);
	xfree(gtids);
	if (!srun_in_list) {
		xfree(srun->key);
		xfree(srun);
	}
	return SLURM_FAILURE;
}
/*
 * Handle a terminate request (stepd_step_rec_t variant): mark surviving
 * tasks as killed_by_cmd, then SIGKILL the step's container.  Replies
 * with an int return code followed by an int errno value.  Permitted
 * for the job owner or root/SlurmUser.
 */
static int _handle_terminate(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	stepd_step_task_info_t *task;
	uint32_t i;

	debug("_handle_terminate for step=%u.%u uid=%d",
	      job->jobid, job->stepid, uid);
	/* NOTE(review): the terminate monitor is armed before the
	 * permission check — confirm this ordering is intentional. */
	step_terminate_monitor_start(job->jobid, job->stepid);

	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("terminate req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* cycle thru the tasks and mark those that have not
	 * called abort and/or terminated as killed_by_cmd */
	for (i = 0; i < job->node_tasks; i++) {
		if (NULL == (task = job->task[i])) {
			continue;
		}
		if (task->aborted || task->exited) {
			continue;
		}
		/* mark that this task is going to be killed by
		 * cmd so we ignore its exit status - otherwise,
		 * we will probably report the final exit status
		 * as SIGKILL */
		task->killed_by_cmd = true;
	}

	/*
	 * Signal the container with SIGKILL; suspend_mutex guards the
	 * suspended flag.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		debug("Terminating suspended job step %u.%u",
		      job->jobid, job->stepid);
	}

	if (proctrack_g_signal(job->cont_id, SIGKILL) < 0) {
		rc = -1;
		errnum = errno;	/* propagate the signal failure to caller */
		verbose("Error sending SIGKILL signal to %u.%u: %m",
			job->jobid, job->stepid);
	} else {
		verbose("Sent SIGKILL signal to %u.%u",
			job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a signal-container request (stepd_step_rec_t variant).
 * Wire format read from fd: int sig.  Replies with an int return code
 * followed by an int errno value.  sig may be a real signal or a SIG_*
 * pseudo-signal.  For SIGTERM/SIGKILL the surviving tasks are flagged
 * killed_by_cmd first.  The CANCELLED banner is printed on the node
 * selected by SLURM_STEP_KILLED_MSG_NODE_ID (default node 0).
 * Permitted for the job owner or root/SlurmUser.
 */
static int _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	/* emit the CANCELLED banner at most once per stepd lifetime */
	static int msg_sent = 0;
	char *ptr = NULL;
	int target_node_id = 0;
	stepd_step_task_info_t *task;
	uint32_t i;

	safe_read(fd, &sig, sizeof(int));

	debug("_handle_signal_container for step=%u.%u uid=%d signal=%d",
	      job->jobid, job->stepid, (int) uid, sig);
	if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
		error("signal container req from uid %ld for step=%u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	if ((sig == SIGTERM) || (sig == SIGKILL)) {
		/* cycle thru the tasks and mark those that have not
		 * called abort and/or terminated as killed_by_cmd */
		for (i = 0; i < job->node_tasks; i++) {
			if (NULL == (task = job->task[i])) {
				continue;
			}
			if (task->aborted || task->exited) {
				continue;
			}
			/* mark that this task is going to be killed by
			 * cmd so we ignore its exit status - otherwise,
			 * we will probably report the final exit status
			 * as SIGKILL */
			task->killed_by_cmd = true;
		}
	}

	/* The banner-printing node is configurable via the environment */
	ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
	if (ptr)
		target_node_id = atoi(ptr);
	if ((job->nodeid == target_node_id) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];
		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}

	/* Pure-notification pseudo-signals carry no signal to deliver */
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED) || (sig == SIG_FAILURE))
		goto done;

	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	pthread_mutex_lock(&suspend_mutex);
	/* Only SIGKILL may be delivered to a suspended step */
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/* Debug wake pokes each task individually instead of the container */
	if (sig == SIG_DEBUG_WAKE) {
		int i;
		for (i = 0; i < job->node_tasks; i++)
			pdebug_wake_process(job, job->task[i]->pid);
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/*
	 * Signal the container
	 */
	if (proctrack_g_signal(job->cont_id, sig) < 0) {
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
/*
 * Handle a step-completion message (stepd_step_rec_t variant).
 * Wire format read from fd: int version, int first, int last, int
 * step_rc, then int len and a len-byte packed jobacct record.  Marks
 * the completed node range, merges accounting data, and replies with an
 * int return code plus an int errno inside the locked region so the
 * stepd cannot exit before the reply is sent.  Restricted to
 * root/SlurmUser.
 */
static int _handle_completion(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int first;
	int last;
	jobacctinfo_t *jobacct = NULL;
	int step_rc;
	char* buf;
	int len;
	Buf buffer;
	int version;	/* For future use */
	bool lock_set = false;	/* true while step_complete.lock is held */

	debug("_handle_completion for job %u.%u",
	      job->jobid, job->stepid);
	debug3(" uid = %d", uid);
	if (!_slurm_authorized_user(uid)) {
		debug("step completion message from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		/* Send the return code and errno */
		safe_write(fd, &rc, sizeof(int));
		safe_write(fd, &errnum, sizeof(int));
		return SLURM_SUCCESS;
	}

	safe_read(fd, &version, sizeof(int));
	safe_read(fd, &first, sizeof(int));
	safe_read(fd, &last, sizeof(int));
	safe_read(fd, &step_rc, sizeof(int));

	/*
	 * We must not use getinfo over a pipe with slurmd here
	 * Indeed, slurmstepd does a large use of setinfo over a pipe
	 * with slurmd and doing the reverse can result in a deadlock
	 * scenario with slurmd :
	 * slurmd(lockforread,write)/slurmstepd(write,lockforread)
	 * Do pack/unpack instead to be sure of independances of
	 * slurmd and slurmstepd
	 */
	safe_read(fd, &len, sizeof(int));
	buf = xmalloc(len);
	safe_read(fd, buf, len);
	buffer = create_buf(buf, len);	/* buffer takes ownership of buf */
	jobacctinfo_unpack(&jobacct, SLURM_PROTOCOL_VERSION,
			   PROTOCOL_TYPE_SLURM, buffer, 1);
	free_buf(buffer);	/* also releases buf */

	/*
	 * Record the completed nodes
	 */
	pthread_mutex_lock(&step_complete.lock);
	lock_set = true;
	if (! step_complete.wait_children) {
		rc = -1;
		errnum = ETIMEDOUT; /* not used anyway */
		goto timeout;
	}

	/* SlurmUser or root can craft a launch without a valid credential
	 * ("srun --no-alloc ...") and no tree information can be built
	 * without the hostlist from the credential. */
	if (step_complete.rank >= 0) {
#if 0
		char bits_string[128];
		debug2("Setting range %d (bit %d) through %d(bit %d)",
		       first, first-(step_complete.rank+1),
		       last, last-(step_complete.rank+1));
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" before bits: %s", bits_string);
#endif
		bit_nset(step_complete.bits,
			 first - (step_complete.rank+1),
			 last - (step_complete.rank+1));
#if 0
		bit_fmt(bits_string, sizeof(bits_string), step_complete.bits);
		debug2(" after bits: %s", bits_string);
#endif
	}
	step_complete.step_rc = MAX(step_complete.step_rc, step_rc);

	/************* acct stuff ********************/
	jobacctinfo_aggregate(step_complete.jobacct, jobacct);
timeout:
	jobacctinfo_destroy(jobacct);
	/*********************************************/

	/* Send the return code and errno, we do this within the locked
	 * region to ensure that the stepd doesn't exit before we can
	 * perform this send. */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));

	pthread_cond_signal(&step_complete.cond);
	pthread_mutex_unlock(&step_complete.lock);

	return SLURM_SUCCESS;

rwfail:
	/* Release the lock if a write failed inside the locked region */
	if (lock_set) {
		pthread_cond_signal(&step_complete.cond);
		pthread_mutex_unlock(&step_complete.lock);
	}
	return SLURM_FAILURE;
}
/*
 * Handle a suspend request (stepd_step_rec_t variant).
 * Wire format read from fd: uint16_t job_core_spec.  Sends SIGTSTP then
 * (after a delay) SIGSTOP to the step's container, with switch and
 * core-spec plugin hooks around the suspension.  The SIGTSTP phase is
 * skipped for launch/poe (IBM MPI can hang on SIGTSTP).  Restricted to
 * root/SlurmUser.  Replies with an int return code plus an int errno.
 */
static int _handle_suspend(int fd, stepd_step_rec_t *job, uid_t uid)
{
	static int launch_poe = -1;	/* cached: 1 if launch/poe in use */
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	uint16_t job_core_spec = (uint16_t) NO_VAL;

	safe_read(fd, &job_core_spec, sizeof(uint16_t));

	debug("_handle_suspend for step:%u.%u uid:%ld core_spec:%u",
	      job->jobid, job->stepid, (long)uid, job_core_spec);

	if (!_slurm_authorized_user(uid)) {
		debug("job step suspend request from uid %ld for job %u.%u ",
		      (long)uid, job->jobid, job->stepid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}
	/* No container => nothing running to suspend */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
		       job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	/* Pause accounting sampling while the step is stopped */
	acct_gather_suspend_poll();

	/* Determine (once) whether the poe launcher is in use */
	if (launch_poe == -1) {
		char *launch_type = slurm_get_launch_type();
		if (!strcmp(launch_type, "launch/poe"))
			launch_poe = 1;
		else
			launch_poe = 0;
		xfree(launch_type);
	}

	/*
	 * Signal the container; suspend_mutex guards the suspended flag.
	 */
	pthread_mutex_lock(&suspend_mutex);
	if (suspended) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	} else {
		if (!job->batch && switch_g_job_step_pre_suspend(job))
			error("switch_g_job_step_pre_suspend: %m");

		/* SIGTSTP is sent first to let MPI daemons stop their tasks,
		 * then wait 2 seconds, then send SIGSTOP to the spawned
		 * process's container to stop everything else.
		 *
		 * In some cases, 1 second has proven insufficient. Longer
		 * delays may help insure that all MPI tasks have been stopped
		 * (that depends upon the MPI implementaiton used), but will
		 * also permit longer time periods when more than one job can
		 * be running on each resource (not good). */
		if (launch_poe == 0) {
			/* IBM MPI seens to periodically hang upon receipt
			 * of SIGTSTP. */
			if (proctrack_g_signal(job->cont_id, SIGTSTP) < 0) {
				verbose("Error suspending %u.%u (SIGTSTP): %m",
					job->jobid, job->stepid);
			} else
				sleep(2);
		}

		if (proctrack_g_signal(job->cont_id, SIGSTOP) < 0) {
			verbose("Error suspending %u.%u (SIGSTOP): %m",
				job->jobid, job->stepid);
		} else {
			verbose("Suspended %u.%u", job->jobid, job->stepid);
		}
		suspended = true;
	}
	if (!job->batch && switch_g_job_step_post_suspend(job))
		error("switch_g_job_step_post_suspend: %m");
	if (!job->batch && core_spec_g_suspend(job->cont_id, job_core_spec))
		error("core_spec_g_suspend: %m");

	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errno */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}