static void *_monitor(void *arg)
{
	stepd_step_rec_t *job = (stepd_step_rec_t *)arg;
	struct timespec ts = {0, 0};
	int rc;

	debug2("step_terminate_monitor will run for %d secs", timeout);

	/* Absolute deadline for pthread_cond_timedwait() */
	ts.tv_sec = time(NULL) + 1 + timeout;

	slurm_mutex_lock(&lock);
	if (stop_flag)
		goto done;

	rc = pthread_cond_timedwait(&cond, &lock, &ts);
	if (rc == ETIMEDOUT) {
		char entity[45], time_str[24];
		time_t now = time(NULL);

		_call_external_program(job);

		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u",
				 job->jobid);
		} else if (job->stepid == SLURM_EXTERN_CONT) {
			snprintf(entity, sizeof(entity), "EXTERN STEP FOR %u",
				 job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		if (job->state < SLURMSTEPD_STEP_RUNNING) {
			error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT RUNNING ***",
			      entity, job->node_name, time_str);
			rc = ESLURMD_JOB_NOTRUNNING;
		} else {
			error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT ENDING WITH SIGNALS ***",
			      entity, job->node_name, time_str);
			rc = ESLURMD_KILL_TASK_FAILED;
		}

		exit(stepd_cleanup(NULL, job, NULL, NULL, rc, 0));
	} else if (rc != 0) {
		error("Error waiting on condition in _monitor: %m");
	}
done:
	slurm_mutex_unlock(&lock);

	debug2("step_terminate_monitor is stopping");
	return NULL;
}
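/*
 * Minimal standalone sketch of the timed-wait pattern used by _monitor()
 * above: a thread blocks on a condition variable with an absolute deadline;
 * ETIMEDOUT means the deadline expired and emergency action is needed, while
 * a normal wakeup means the monitor was cancelled in time. The demo_* names,
 * stop_flag, and timeout here are local to this sketch, not Slurm code, and
 * it is meant to be compiled as a separate program (-lpthread).
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_cond = PTHREAD_COND_INITIALIZER;
static int stop_flag = 0;
static int timeout = 2;		/* seconds to wait before giving up */

static void *demo_monitor(void *arg)
{
	struct timespec ts = {0, 0};
	int rc;

	(void)arg;

	/* pthread_cond_timedwait() takes an absolute time, not a delay. */
	ts.tv_sec = time(NULL) + 1 + timeout;

	pthread_mutex_lock(&demo_lock);
	while (!stop_flag) {
		rc = pthread_cond_timedwait(&demo_cond, &demo_lock, &ts);
		if (rc == ETIMEDOUT) {
			printf("deadline expired: would force cleanup here\n");
			break;
		} else if (rc != 0) {
			fprintf(stderr, "pthread_cond_timedwait: %d\n", rc);
			break;
		}
		/* rc == 0: woken early; the loop re-checks stop_flag so
		 * spurious wakeups are handled. */
	}
	pthread_mutex_unlock(&demo_lock);
	return NULL;
}

static void demo_monitor_stop(void)
{
	pthread_mutex_lock(&demo_lock);
	stop_flag = 1;
	pthread_cond_signal(&demo_cond);
	pthread_mutex_unlock(&demo_lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, demo_monitor, NULL);
	sleep(1);		/* pretend the step finished quickly */
	demo_monitor_stop();	/* cancel the monitor before its deadline */
	pthread_join(tid, NULL);
	return 0;
}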
int main(int argc, char **argv)
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	stepd_step_rec_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;
	char *launch_params;

	if (_process_cmdline(argc, argv) < 0)
		fatal("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);

	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);

	if (slurm_select_init(1) != SLURM_SUCCESS)
		fatal("failed to initialize node selection plugin");
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		fatal("failed to initialize authentication plugin");

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Create the stepd_step_rec_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);
	_got_ack_from_slurmd(STDIN_FILENO);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* slurmstepd is the only daemon that should survive upgrade. If it
	 * had been swapped out before upgrade happened it could easily lead
	 * to SIGBUS at any time after upgrade. Avoid that by locking it
	 * in-memory. */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;
		if (mlockall(flags) < 0)
			info("failed to mlock() slurmstepd pages: %m");
		else
			debug("slurmstepd locked in memory");
#else
		info("mlockall() system call does not appear to be available");
#endif
	}
	xfree(launch_params);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);

	return stepd_cleanup(msg, job, cli, self, rc, 0);
ending:
	return stepd_cleanup(msg, job, cli, self, rc, 1);
}
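/*
 * Standalone sketch (a separate demo program, not part of slurmstepd) of the
 * mlockall() guard used in main() above: lock the pages currently mapped,
 * and optionally all future pages, so the process keeps running safely even
 * if its binary is replaced on disk during an upgrade. The launch_params
 * string here is hard-coded illustrative input; the option names mirror the
 * ones checked above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *launch_params = "slurmstepd_memlock_all";

	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;

		/* MCL_FUTURE also pins pages mapped after this call
		 * (new heap, thread stacks, mmap'd files). */
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;

		if (mlockall(flags) < 0)
			perror("mlockall");
		else
			printf("process locked in memory (flags=0x%x)\n",
			       (unsigned)flags);

		/* Pages stay locked until munlockall() or process exit. */
#else
		fprintf(stderr, "mlockall() not available on this system\n");
#endif
	}
	return 0;
}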