int main(int argc, char *argv[]) { log_options_t logopt = LOG_OPTS_STDERR_ONLY; job_desc_msg_t desc; resource_allocation_response_msg_t *alloc; time_t before, after; allocation_msg_thread_t *msg_thr; char **env = NULL; int status = 0; int retries = 0; pid_t pid = getpid(); pid_t tpgid = 0; pid_t rc_pid = 0; int i, rc = 0; static char *msg = "Slurm job queue full, sleeping and retrying."; slurm_allocation_callbacks_t callbacks; log_init(xbasename(argv[0]), logopt, 0, NULL); _set_exit_code(); if (spank_init_allocator() < 0) { error("Failed to initialize plugin stack"); exit(error_exit); } /* Be sure to call spank_fini when salloc exits */ if (atexit((void (*) (void)) spank_fini) < 0) error("Failed to register atexit handler for plugins: %m"); if (initialize_and_process_args(argc, argv) < 0) { error("salloc parameter parsing"); exit(error_exit); } /* reinit log with new verbosity (if changed by command line) */ if (opt.verbose || opt.quiet) { logopt.stderr_level += opt.verbose; logopt.stderr_level -= opt.quiet; logopt.prefix_level = 1; log_alter(logopt, 0, NULL); } if (spank_init_post_opt() < 0) { error("Plugin stack post-option processing failed"); exit(error_exit); } _set_spank_env(); _set_submit_dir_env(); if (opt.cwd && chdir(opt.cwd)) { error("chdir(%s): %m", opt.cwd); exit(error_exit); } if (opt.get_user_env_time >= 0) { char *user = uid_to_string(opt.uid); if (strcmp(user, "nobody") == 0) { error("Invalid user id %u: %m", (uint32_t)opt.uid); exit(error_exit); } env = env_array_user_default(user, opt.get_user_env_time, opt.get_user_env_mode); xfree(user); if (env == NULL) exit(error_exit); /* error already logged */ _set_rlimits(env); } /* * Job control for interactive salloc sessions: only if ... * * a) input is from a terminal (stdin has valid termios attributes), * b) controlling terminal exists (non-negative tpgid), * c) salloc is not run in allocation-only (--no-shell) mode, * NOTE: d and e below are configuration dependent * d) salloc runs in its own process group (true in interactive * shells that support job control), * e) salloc has been configured at compile-time to support background * execution and is not currently in the background process group. */ if (tcgetattr(STDIN_FILENO, &saved_tty_attributes) < 0) { /* * Test existence of controlling terminal (tpgid > 0) * after first making sure stdin is not redirected. */ } else if ((tpgid = tcgetpgrp(STDIN_FILENO)) < 0) { if (!opt.no_shell) { error("no controlling terminal: please set --no-shell"); exit(error_exit); } #ifdef SALLOC_RUN_FOREGROUND } else if ((!opt.no_shell) && (pid == getpgrp())) { if (tpgid == pid) is_interactive = true; while (tcgetpgrp(STDIN_FILENO) != pid) { if (!is_interactive) { error("Waiting for program to be placed in " "the foreground"); is_interactive = true; } killpg(pid, SIGTTIN); } } #else } else if ((!opt.no_shell) && (getpgrp() == tcgetpgrp(STDIN_FILENO))) {
int main(int argc, char *argv[]) { log_options_t logopt = LOG_OPTS_STDERR_ONLY; job_desc_msg_t desc; resource_allocation_response_msg_t *alloc; time_t before, after; allocation_msg_thread_t *msg_thr; char **env = NULL; int status = 0; int retries = 0; pid_t pid = getpid(); pid_t tpgid = 0; pid_t rc_pid = 0; int i, rc = 0; static char *msg = "Slurm job queue full, sleeping and retrying."; slurm_allocation_callbacks_t callbacks; log_init(xbasename(argv[0]), logopt, 0, NULL); _set_exit_code(); if (spank_init_allocator() < 0) { error("Failed to initialize plugin stack"); exit(error_exit); } /* Be sure to call spank_fini when salloc exits */ if (atexit((void (*) (void)) spank_fini) < 0) error("Failed to register atexit handler for plugins: %m"); if (initialize_and_process_args(argc, argv) < 0) { error("salloc parameter parsing"); exit(error_exit); } /* reinit log with new verbosity (if changed by command line) */ if (opt.verbose || opt.quiet) { logopt.stderr_level += opt.verbose; logopt.stderr_level -= opt.quiet; logopt.prefix_level = 1; log_alter(logopt, 0, NULL); } if (spank_init_post_opt() < 0) { error("Plugin stack post-option processing failed"); exit(error_exit); } _set_spank_env(); _set_submit_dir_env(); if (opt.cwd && chdir(opt.cwd)) { error("chdir(%s): %m", opt.cwd); exit(error_exit); } if (opt.get_user_env_time >= 0) { char *user = uid_to_string(opt.uid); if (strcmp(user, "nobody") == 0) { error("Invalid user id %u: %m", (uint32_t)opt.uid); exit(error_exit); } env = env_array_user_default(user, opt.get_user_env_time, opt.get_user_env_mode); xfree(user); if (env == NULL) exit(error_exit); /* error already logged */ _set_rlimits(env); } /* * Job control for interactive salloc sessions: only if ... * * a) input is from a terminal (stdin has valid termios attributes), * b) controlling terminal exists (non-negative tpgid), * c) salloc is not run in allocation-only (--no-shell) mode, * d) salloc runs in its own process group (true in interactive * shells that support job control), * e) salloc has been configured at compile-time to support background * execution and is not currently in the background process group. */ if (tcgetattr(STDIN_FILENO, &saved_tty_attributes) < 0) { /* * Test existence of controlling terminal (tpgid > 0) * after first making sure stdin is not redirected. */ } else if ((tpgid = tcgetpgrp(STDIN_FILENO)) < 0) { if (!opt.no_shell) { error("no controlling terminal: please set --no-shell"); exit(error_exit); } } else if ((!opt.no_shell) && (pid == getpgrp())) { if (tpgid == pid) is_interactive = true; #ifdef SALLOC_RUN_FOREGROUND while (tcgetpgrp(STDIN_FILENO) != pid) { if (!is_interactive) { error("Waiting for program to be placed in " "the foreground"); is_interactive = true; } killpg(pid, SIGTTIN); } #endif } /* * Reset saved tty attributes at exit, in case a child * process died before properly resetting terminal. */ if (is_interactive) atexit (_reset_input_mode); /* * Request a job allocation */ slurm_init_job_desc_msg(&desc); if (_fill_job_desc_from_opts(&desc) == -1) { exit(error_exit); } if (opt.gid != (gid_t) -1) { if (setgid(opt.gid) < 0) { error("setgid: %m"); exit(error_exit); } } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&desc.other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); before = time(NULL); while ((alloc = slurm_allocate_resources_blocking(&desc, opt.immediate, _pending_callback)) == NULL) { if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || (retries >= MAX_RETRIES)) break; if (retries == 0) error("%s", msg); else debug("%s", msg); sleep (++retries); } /* become the user after the allocation has been requested. */ if (opt.uid != (uid_t) -1) { if (setuid(opt.uid) < 0) { error("setuid: %m"); exit(error_exit); } } if (alloc == NULL) { if (allocation_interrupted) { /* cancelled by signal */ info("Job aborted due to signal"); } else if (errno == EINTR) { error("Interrupted by signal." " Allocation request rescinded."); } else if (opt.immediate && ((errno == ETIMEDOUT) || (errno == ESLURM_NOT_TOP_PRIORITY) || (errno == ESLURM_NODES_BUSY))) { error("Unable to allocate resources: %m"); error_exit = immediate_exit; } else { error("Failed to allocate resources: %m"); } slurm_allocation_msg_thr_destroy(msg_thr); exit(error_exit); } else if (!allocation_interrupted) { /* * Allocation granted! */ info("Granted job allocation %u", alloc->job_id); pending_job_id = alloc->job_id; #ifdef HAVE_BG if (!_wait_bluegene_block_ready(alloc)) { if(!allocation_interrupted) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else if (!_wait_nodes_ready(alloc)) { if(!allocation_interrupted) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } after = time(NULL); if (opt.bell == BELL_ALWAYS || (opt.bell == BELL_AFTER_DELAY && ((after - before) > DEFAULT_BELL_DELAY))) { _ring_terminal_bell(); } if (opt.no_shell) exit(0); if (allocation_interrupted) { /* salloc process received a signal after * slurm_allocate_resources_blocking returned with the * allocation, but before the new signal handlers were * registered. */ goto relinquish; } /* * Run the user's command. */ if (env_array_for_job(&env, alloc, &desc) != SLURM_SUCCESS) goto relinquish; /* Add default task count for srun, if not already set */ if (opt.ntasks_set) { env_array_append_fmt(&env, "SLURM_NTASKS", "%d", opt.ntasks); /* keep around for old scripts */ env_array_append_fmt(&env, "SLURM_NPROCS", "%d", opt.ntasks); } if (opt.cpus_per_task > 1) { env_array_append_fmt(&env, "SLURM_CPUS_PER_TASK", "%d", opt.cpus_per_task); } if (opt.overcommit) { env_array_append_fmt(&env, "SLURM_OVERCOMMIT", "%d", opt.overcommit); } if (opt.acctg_freq >= 0) { env_array_append_fmt(&env, "SLURM_ACCTG_FREQ", "%d", opt.acctg_freq); } if (opt.network) env_array_append_fmt(&env, "SLURM_NETWORK", "%s", opt.network); env_array_set_environment(env); env_array_free(env); pthread_mutex_lock(&allocation_state_lock); if (allocation_state == REVOKED) { error("Allocation was revoked for job %u before command could " "be run", alloc->job_id); pthread_mutex_unlock(&allocation_state_lock); if (slurm_complete_job(alloc->job_id, status) != 0) { error("Unable to clean up allocation for job %u: %m", alloc->job_id); } return 1; } allocation_state = GRANTED; pthread_mutex_unlock(&allocation_state_lock); /* Ensure that salloc has initial terminal foreground control. */ if (is_interactive) { /* * Ignore remaining job-control signals (other than those in * sig_array, which at this state act like SIG_IGN). */ xsignal(SIGTSTP, SIG_IGN); xsignal(SIGTTIN, SIG_IGN); xsignal(SIGTTOU, SIG_IGN); pid = getpid(); setpgid(pid, pid); tcsetpgrp(STDIN_FILENO, pid); } command_pid = _fork_command(command_argv); /* * Wait for command to exit, OR for waitpid to be interrupted by a * signal. Either way, we are going to release the allocation next. */ if (command_pid > 0) { setpgid(command_pid, command_pid); if (is_interactive) tcsetpgrp(STDIN_FILENO, command_pid); /* NOTE: Do not process signals in separate pthread. * The signal will cause waitpid() to exit immediately. */ xsignal(SIGHUP, _exit_on_signal); /* Use WUNTRACED to treat stopped children like terminated ones */ do { rc_pid = waitpid(command_pid, &status, WUNTRACED); } while ((rc_pid == -1) && (!exit_flag)); if ((rc_pid == -1) && (errno != EINTR)) error("waitpid for %s failed: %m", command_argv[0]); } if (is_interactive) tcsetpgrp(STDIN_FILENO, pid); /* * Relinquish the job allocation (if not already revoked). */ relinquish: pthread_mutex_lock(&allocation_state_lock); if (allocation_state != REVOKED) { pthread_mutex_unlock(&allocation_state_lock); info("Relinquishing job allocation %d", alloc->job_id); if ((slurm_complete_job(alloc->job_id, status) != 0) && (slurm_get_errno() != ESLURM_ALREADY_DONE)) error("Unable to clean up job allocation %d: %m", alloc->job_id); pthread_mutex_lock(&allocation_state_lock); allocation_state = REVOKED; } pthread_mutex_unlock(&allocation_state_lock); slurm_free_resource_allocation_response_msg(alloc); slurm_allocation_msg_thr_destroy(msg_thr); /* * Figure out what return code we should use. If the user's command * exited normally, return the user's return code. */ rc = 1; if (rc_pid != -1) { if (WIFEXITED(status)) { rc = WEXITSTATUS(status); } else if (WIFSIGNALED(status)) { verbose("Command \"%s\" was terminated by signal %d", command_argv[0], WTERMSIG(status)); /* if we get these signals we return a normal * exit since this was most likely sent from the * user */ switch(WTERMSIG(status)) { case SIGHUP: case SIGINT: case SIGQUIT: case SIGKILL: rc = 0; break; default: break; } } } return rc; }