/* returns the pid of the forked command, or <0 on error */
static pid_t _fork_command(char **command)
{
	pid_t pid;

	pid = fork();
	if (pid < 0) {
		error("fork failed: %m");
	} else if (pid == 0) {
		/* child */
		setpgid(getpid(), 0);

		/*
		 * Reset job control signals.
		 * Suspend (TSTP) is not restored (ignored, as in the parent):
		 * shells with job-control override this and look after their
		 * processes.
		 * Suspending single commands is more complex and would require
		 * adding full shell-like job control to salloc.
		 */
		xsignal(SIGINT, SIG_DFL);
		xsignal(SIGQUIT, SIG_DFL);
		xsignal(SIGTTIN, SIG_DFL);
		xsignal(SIGTTOU, SIG_DFL);

		execvp(command[0], command);

		/* should only get here if execvp failed */
		error("Unable to exec command \"%s\"", command[0]);
		exit(error_exit);
	}
	/* parent returns */
	return pid;
}
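/*
 * A minimal, self-contained sketch of the same fork/exec pattern used by
 * _fork_command() above: the child moves into its own process group, restores
 * default dispositions for the job-control signals, then execs.  It uses
 * plain signal() instead of SLURM's xsignal() wrapper, and the command run
 * here is purely illustrative.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t fork_command(char **argv)
{
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
	} else if (pid == 0) {
		/* child: own process group, default signal handlers */
		setpgid(0, 0);
		signal(SIGINT,  SIG_DFL);
		signal(SIGQUIT, SIG_DFL);
		signal(SIGTTIN, SIG_DFL);
		signal(SIGTTOU, SIG_DFL);
		execvp(argv[0], argv);
		perror("execvp");	/* only reached if execvp failed */
		_exit(127);
	}
	return pid;	/* parent */
}

int main(void)
{
	char *argv[] = { "echo", "hello", NULL };	/* hypothetical command */
	pid_t pid = fork_command(argv);
	int status = 0;

	if (pid > 0)
		waitpid(pid, &status, 0);
	return 0;
}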
ssize_t _slurm_msg_sendto_timeout(slurm_fd_t fd, char *buffer, size_t size,
				  uint32_t flags, int timeout)
{
	int len;
	uint32_t usize;
	SigFunc *ohandler;

	/*
	 * Ignore SIGPIPE so that send can return an error code if the
	 * other side closes the socket
	 */
	ohandler = xsignal(SIGPIPE, SIG_IGN);

	usize = htonl(size);

	if ((len = _slurm_send_timeout(fd, (char *) &usize, sizeof(usize),
				       0, timeout)) < 0)
		goto done;

	if ((len = _slurm_send_timeout(fd, buffer, size, 0, timeout)) < 0)
		goto done;

done:
	xsignal(SIGPIPE, ohandler);
	return len;
}
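/*
 * The pattern above (temporarily ignore SIGPIPE around a socket write so that
 * a closed peer shows up as an EPIPE error from the write instead of killing
 * the process, then restore the previous disposition) can be sketched with
 * plain POSIX sigaction().  This standalone illustration is not SLURM code;
 * it writes to a pipe whose read end has already been closed.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static ssize_t write_no_sigpipe(int fd, const void *buf, size_t len)
{
	struct sigaction ignore, saved;
	ssize_t n;

	ignore.sa_handler = SIG_IGN;
	sigemptyset(&ignore.sa_mask);
	ignore.sa_flags = 0;

	sigaction(SIGPIPE, &ignore, &saved);	/* ignore, remember old */
	n = write(fd, buf, len);		/* EPIPE instead of SIGPIPE */
	sigaction(SIGPIPE, &saved, NULL);	/* restore old disposition */
	return n;
}

int main(void)
{
	int p[2];

	pipe(p);
	close(p[0]);				/* no reader left */
	if (write_no_sigpipe(p[1], "x", 1) < 0)
		printf("write failed: %s\n", strerror(errno));
	return 0;
}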
void init_main(void)
{
	struct sigaction sig_act;

	if (getpid() != 1) error_exit("Already running");
	printf("Started init\n");
	initialize_console();
	reset_term(0);

	if (chdir("/")) perror_exit("Can't cd to /");
	setsid();

	putenv("HOME=/");
	putenv("PATH=/sbin:/usr/sbin:/bin:/usr/bin");
	putenv("SHELL=/bin/sh");
	putenv("USER=root");

	inittab_parsing();

	xsignal(SIGUSR1, halt_poweroff_reboot_handler); // halt
	xsignal(SIGUSR2, halt_poweroff_reboot_handler); // poweroff
	xsignal(SIGTERM, halt_poweroff_reboot_handler); // reboot
	xsignal(SIGQUIT, restart_init_handler);         // restart init

	memset(&sig_act, 0, sizeof(sig_act));
	sigfillset(&sig_act.sa_mask);
	sigdelset(&sig_act.sa_mask, SIGCONT);
	sig_act.sa_handler = pause_handler;
	sigaction(SIGTSTP, &sig_act, NULL);

	memset(&sig_act, 0, sizeof(sig_act));
	sig_act.sa_handler = catch_signal;
	sigaction(SIGINT, &sig_act, NULL);
	sigaction(SIGHUP, &sig_act, NULL);

	run_action_from_list(SYSINIT);
	check_if_pending_signals();
	run_action_from_list(WAIT);
	check_if_pending_signals();
	run_action_from_list(ONCE);

	while (1) {
		int suspected_WNOHANG = check_if_pending_signals();

		run_action_from_list(RESPAWN | ASKFIRST);
		suspected_WNOHANG |= check_if_pending_signals();
		sleep(1); // let the CPU breathe
		suspected_WNOHANG |= check_if_pending_signals();
		if (suspected_WNOHANG) suspected_WNOHANG = WNOHANG;

		while (1) {
			pid_t pid = waitpid(-1, NULL, suspected_WNOHANG);

			if (pid <= 0) break;
			mark_as_terminated_process(pid);
			suspected_WNOHANG = WNOHANG;
		}
	}
}
static void _unblock_signals(void)
{
	sigset_t set;
	int i;

	for (i = 0; sig_array[i]; i++) {
		/* eliminate pending signals, then set to default */
		xsignal(sig_array[i], SIG_IGN);
		xsignal(sig_array[i], SIG_DFL);
	}
	sigemptyset(&set);
	xsignal_set_mask(&set);
}
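/*
 * xsignal_set_mask() presumably wraps sigprocmask()/pthread_sigmask().  The
 * same idea (discard anything pending, restore default handlers, then clear
 * the blocked-signal mask) can be sketched with plain POSIX calls and a
 * hypothetical, zero-terminated signal list:
 */
#include <signal.h>

static void unblock_signals(const int *sigs)
{
	sigset_t empty;
	int i;

	for (i = 0; sigs[i]; i++) {
		/* setting SIG_IGN first discards any pending instance */
		signal(sigs[i], SIG_IGN);
		signal(sigs[i], SIG_DFL);
	}
	sigemptyset(&empty);
	/* empty mask: nothing stays blocked */
	sigprocmask(SIG_SETMASK, &empty, NULL);
}

int main(void)
{
	int sigs[] = { SIGINT, SIGTERM, 0 };	/* illustrative list */

	unblock_signals(sigs);
	return 0;
}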
static void *_pty_thread(void *arg)
{
	int fd = -1;
	srun_job_t *job = (srun_job_t *) arg;
	slurm_addr_t client_addr;

	xsignal_unblock(pty_sigarray);
	xsignal(SIGWINCH, _handle_sigwinch);

	if ((fd = slurm_accept_msg_conn(job->pty_fd, &client_addr)) < 0) {
		error("pty: accept failure: %m");
		return NULL;
	}

	while (job->state <= SRUN_JOB_RUNNING) {
		debug2("waiting for SIGWINCH");
		poll(NULL, 0, -1);
		if (winch) {
			set_winsize(job);
			_notify_winsize_change(fd, job);
		}
		winch = 0;
	}
	return NULL;
}
static _Noreturn void
json_parse_error (const json_error_t *error)
{
  Lisp_Object symbol;
#if JSON_HAS_ERROR_CODE
  switch (json_error_code (error))
    {
    case json_error_premature_end_of_input:
      symbol = Qjson_end_of_file;
      break;
    case json_error_end_of_input_expected:
      symbol = Qjson_trailing_content;
      break;
    default:
      symbol = Qjson_parse_error;
      break;
    }
#else
  if (json_has_suffix (error->text, "expected near end of file"))
    symbol = Qjson_end_of_file;
  else if (json_has_prefix (error->text, "end of file expected"))
    symbol = Qjson_trailing_content;
  else
    symbol = Qjson_parse_error;
#endif
  xsignal (symbol,
	   list5 (json_build_string (error->text),
		  json_build_string (error->source),
		  make_natnum (error->line),
		  make_natnum (error->column),
		  make_natnum (error->position)));
}
extern void slurm_persist_conn_recv_server_init(void)
{
	int sigarray[] = {SIGUSR1, 0};

	shutdown_time = 0;

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);
}
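/*
 * Why the SIGUSR1 trick works: a handler installed via sigaction() without
 * SA_RESTART makes a blocking accept() fail with EINTR when the signal is
 * delivered, so the accept loop can notice a shutdown flag.  A standalone
 * illustration of that mechanism (not SLURM code; error handling omitted;
 * send the process SIGUSR1 to make it exit):
 */
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static volatile sig_atomic_t shutting_down;

static void on_usr1(int sig)
{
	(void) sig;
	shutting_down = 1;	/* async-signal-safe: just set a flag */
}

int main(void)
{
	struct sigaction sa;
	int listenfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET };

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_usr1;	/* note: no SA_RESTART */
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	bind(listenfd, (struct sockaddr *) &addr, sizeof(addr));
	listen(listenfd, 8);

	while (!shutting_down) {
		int fd = accept(listenfd, NULL, NULL);

		if (fd < 0) {
			if (errno == EINTR)
				continue;	/* interrupted, recheck flag */
			break;
		}
		close(fd);
	}
	close(listenfd);
	return 0;
}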
void
unexec (const char *new_name, const char *old_name)
{
  Lisp_Object data;
  Lisp_Object errstring;

  if (! dldump (0, new_name, RTLD_MEMORY))
    return;

  data = Fcons (build_string (new_name), Qnil);
  synchronize_system_messages_locale ();
  errstring = code_convert_string_norecord (build_string (dlerror ()),
					    Vlocale_coding_system, 0);

  xsignal (Qfile_error,
	   Fcons (build_string ("Cannot unexec"), Fcons (errstring, data)));
}
static void pause_handler(int sig_no)
{
	int signal_backup, errno_backup;
	pid_t pid;

	errno_backup = errno;
	signal_backup = caught_signal;
	xsignal(SIGCONT, catch_signal);

	while (1) {
		if (caught_signal == SIGCONT) break;
		do
			pid = waitpid(-1, NULL, WNOHANG);
		while ((pid == -1) && (errno == EINTR));
		mark_as_terminated_process(pid);
		sleep(1);
	}

	signal(SIGCONT, SIG_DFL);
	errno = errno_backup;
	caught_signal = signal_backup;
}
void nohup_main(void)
{
	xsignal(SIGHUP, SIG_IGN);

	if (isatty(1)) {
		close(1);
		if (-1 == open("nohup.out", O_CREAT|O_APPEND|O_WRONLY,
				S_IRUSR|S_IWUSR)) {
			char *temp = getenv("HOME");

			temp = xmprintf("%s/%s", temp ? temp : "", "nohup.out");
			xcreate(temp, O_CREAT|O_APPEND|O_WRONLY, 0600);
			free(temp);
		}
	}
	if (isatty(0)) {
		close(0);
		xopen_stdio("/dev/null", O_RDONLY);
	}
	xexec(toys.optargs);
}
extern int create_job_step(srun_job_t *job, bool use_all_cpus) { int i, rc; unsigned long my_sleep = 0; time_t begin_time; slurm_step_ctx_params_t_init(&job->ctx_params); job->ctx_params.job_id = job->jobid; job->ctx_params.uid = opt.uid; /* set the jobid for totalview */ totalview_jobid = NULL; xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id); /* Validate minimum and maximum node counts */ if (opt.min_nodes && opt.max_nodes && (opt.min_nodes > opt.max_nodes)) { error ("Minimum node count > maximum node count (%d > %d)", opt.min_nodes, opt.max_nodes); return -1; } #if !defined HAVE_FRONT_END || (defined HAVE_BGQ) //#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES) if (opt.min_nodes && (opt.min_nodes > job->nhosts)) { error ("Minimum node count > allocated node count (%d > %d)", opt.min_nodes, job->nhosts); return -1; } #endif job->ctx_params.min_nodes = job->nhosts; if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes)) job->ctx_params.min_nodes = opt.min_nodes; job->ctx_params.max_nodes = job->nhosts; if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes)) job->ctx_params.max_nodes = opt.max_nodes; if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL)) job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node; job->ctx_params.task_count = opt.ntasks; if (opt.mem_per_cpu != NO_VAL) job->ctx_params.mem_per_cpu = opt.mem_per_cpu; job->ctx_params.gres = opt.gres; if (use_all_cpus) job->ctx_params.cpu_count = job->cpu_count; else if (opt.overcommit) job->ctx_params.cpu_count = job->ctx_params.min_nodes; else job->ctx_params.cpu_count = opt.ntasks*opt.cpus_per_task; job->ctx_params.relative = (uint16_t)opt.relative; job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval; job->ctx_params.ckpt_dir = opt.ckpt_dir; job->ctx_params.gres = opt.gres; job->ctx_params.exclusive = (uint16_t)opt.exclusive; if (opt.immediate == 1) job->ctx_params.immediate = (uint16_t)opt.immediate; if (opt.time_limit != NO_VAL) job->ctx_params.time_limit = (uint32_t)opt.time_limit; job->ctx_params.verbose_level = (uint16_t)_verbose; if (opt.resv_port_cnt != NO_VAL) job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt; switch (opt.distribution) { case SLURM_DIST_BLOCK: case SLURM_DIST_ARBITRARY: case SLURM_DIST_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_BLOCK_BLOCK: job->ctx_params.task_dist = opt.distribution; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; job->ctx_params.plane_size = opt.plane_size; break; default: job->ctx_params.task_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; opt.distribution = job->ctx_params.task_dist; break; } job->ctx_params.overcommit = opt.overcommit ? 
1 : 0; job->ctx_params.node_list = opt.nodelist; job->ctx_params.network = opt.network; job->ctx_params.no_kill = opt.no_kill; if (opt.job_name_set_cmd && opt.job_name) job->ctx_params.name = opt.job_name; else job->ctx_params.name = opt.cmd_name; debug("requesting job %u, user %u, nodes %u including (%s)", job->ctx_params.job_id, job->ctx_params.uid, job->ctx_params.min_nodes, job->ctx_params.node_list); debug("cpus %u, tasks %u, name %s, relative %u", job->ctx_params.cpu_count, job->ctx_params.task_count, job->ctx_params.name, job->ctx_params.relative); begin_time = time(NULL); for (i=0; (!destroy_job); i++) { if (opt.no_alloc) { job->step_ctx = slurm_step_ctx_create_no_alloc( &job->ctx_params, job->stepid); } else job->step_ctx = slurm_step_ctx_create( &job->ctx_params); if (job->step_ctx != NULL) { if (i > 0) info("Job step created"); break; } rc = slurm_get_errno(); if (((opt.immediate != 0) && ((opt.immediate == 1) || (difftime(time(NULL), begin_time) > opt.immediate))) || ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) && (rc != ESLURM_PROLOG_RUNNING) && (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) && (rc != ESLURM_DISABLED))) { error ("Unable to create job step: %m"); return -1; } if (i == 0) { if (rc == ESLURM_PROLOG_RUNNING) { verbose("Resources allocated for job %u and " "being configured, please wait", job->ctx_params.job_id); } else { info("Job step creation temporarily disabled, " "retrying"); } xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); my_sleep = (getpid() % 1000) * 100 + 100000; } else { verbose("Job step creation still disabled, retrying"); my_sleep = MIN((my_sleep * 2), 29000000); } /* sleep 0.1 to 29 secs with exponential back-off */ usleep(my_sleep); if (destroy_job) { /* cancelled by signal */ break; } } if (i > 0) { xsignal_block(sig_array); if (destroy_job) { info("Cancelled pending job step"); return -1; } } slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid); /* Number of hosts in job may not have been initialized yet if * --jobid was used or only SLURM_JOB_ID was set in user env. * Reset the value here just in case. */ slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS, &job->nhosts); /* * Recreate filenames which may depend upon step id */ job_update_io_fnames(job); return 0; }
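/*
 * The step-creation retry loop above backs off exponentially: an initial
 * delay of 0.1 s plus pid-derived jitter, doubled on every retry and capped
 * at 29 s.  A small standalone sketch of that schedule (constants copied from
 * the code above; the printout is illustrative only):
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* initial delay: 0.1 s plus up to ~0.1 s of pid-derived jitter */
	unsigned long delay_usec = (getpid() % 1000) * 100 + 100000;
	int attempt;

	for (attempt = 1; attempt <= 10; attempt++) {
		printf("attempt %2d: would sleep %.1f s\n",
		       attempt, delay_usec / 1e6);
		/* the real code calls usleep(delay_usec) here */
		if ((delay_usec *= 2) > 29000000UL)	/* cap at 29 s */
			delay_usec = 29000000UL;
	}
	return 0;
}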
resource_allocation_response_msg_t * allocate_nodes(void) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; #ifdef HAVE_BG if (!_wait_bluegene_block_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else if (!_wait_nodes_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: slurm_free_resource_allocation_response_msg(resp); if (!destroy_job) slurm_complete_job(resp->job_id, 1); exit(error_exit); return NULL; }
resource_allocation_response_msg_t * allocate_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. */ if (opt.pn_min_memory != NO_VAL) opt.pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt.mem_per_cpu != NO_VAL) opt.mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); /* * FIXME: timelimit should probably also be updated * here since it could also change. */ #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt.min_nodes = node_cnt; opt.max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt.min_nodes = resp->node_cnt; opt.max_nodes = resp->node_cnt; if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: if (resp) { if (!destroy_job) slurm_complete_job(resp->job_id, 1); slurm_free_resource_allocation_response_msg(resp); } exit(error_exit); return NULL; }
/* * Allocate nodes for heterogeneous/pack job from the slurm controller -- * retrying the attempt if the controller appears to be down, and optionally * waiting for resources if none are currently available (see opt.immediate) * * Returns a pointer to a resource_allocation_response_msg which must * be freed with slurm_free_resource_allocation_response_msg() */ List allocate_pack_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; bool jobid_log = true; job_desc_msg_t *j, *first_job = NULL; slurm_allocation_callbacks_t callbacks; ListIterator opt_iter, resp_iter; slurm_opt_t *opt_local, *first_opt = NULL; List job_req_list = NULL, job_resp_list = NULL; uint32_t my_job_id = 0; int i, k; job_req_list = list_create(NULL); opt_iter = list_iterator_create(opt_list); while ((opt_local = list_next(opt_iter))) { srun_opt_t *srun_opt = opt_local->srun_opt; xassert(srun_opt); if (!first_opt) first_opt = opt_local; if (srun_opt->relative_set && srun_opt->relative) fatal("--relative option invalid for job allocation request"); if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) return NULL; if (!first_job) first_job = j; j->origin_cluster = xstrdup(slurmctld_conf.cluster_name); /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt_local->jobid_set) { if (jobid_log) { jobid_log = false; /* log once */ info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); } if (!opt_local->jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } list_append(job_req_list, j); } list_iterator_destroy(opt_iter); if (!first_job) { error("%s: No job requests found", __func__); return NULL; } if (first_opt && first_opt->clusters && (slurmdb_get_first_pack_cluster(job_req_list, first_opt->clusters, &working_cluster_rec) != SLURM_SUCCESS)) { print_db_notok(first_opt->clusters, 0); return NULL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (first_opt && !job_resp_list) { job_resp_list = slurm_allocate_pack_job_blocking(job_req_list, first_opt->immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!job_resp_list && !_retry()) { break; } } if (job_resp_list && !destroy_job) { /* * Allocation granted! 
*/ opt_iter = list_iterator_create(opt_list); resp_iter = list_iterator_create(job_resp_list); while ((opt_local = list_next(opt_iter))) { resp = (resource_allocation_response_msg_t *) list_next(resp_iter); if (!resp) break; if (pending_job_id == 0) pending_job_id = resp->job_id; if (my_job_id == 0) { my_job_id = resp->job_id; i = list_count(opt_list); k = list_count(job_resp_list); if (i != k) { error("%s: request count != response count (%d != %d)", __func__, i, k); goto relinquish; } } /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. * * NOTE: pn_min_memory here is an int64, not uint64. * These operations may have some bizarre side effects */ if (opt_local->pn_min_memory != NO_VAL64) opt_local->pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt_local->mem_per_cpu != NO_VAL64) opt_local->mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt_local->min_nodes = node_cnt; opt_local->max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt_local->min_nodes = resp->node_cnt; opt_local->max_nodes = resp->node_cnt; if (resp->working_cluster_rec) slurm_setup_remote_working_cluster(resp); if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } list_iterator_destroy(resp_iter); list_iterator_destroy(opt_iter); } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); return job_resp_list; relinquish: if (job_resp_list) { if (!destroy_job && my_job_id) slurm_complete_job(my_job_id, 1); list_destroy(job_resp_list); } exit(error_exit); return NULL; }
static void *_agent(void *x) { int cnt, rc; Buf buffer; struct timespec abs_time; static time_t fail_time = 0; int sigarray[] = {SIGUSR1, 0}; slurmdbd_msg_t list_req; dbd_list_msg_t list_msg; list_req.msg_type = DBD_SEND_MULT_MSG; list_req.data = &list_msg; memset(&list_msg, 0, sizeof(dbd_list_msg_t)); /* DEF_TIMERS; */ /* Prepare to catch SIGUSR1 to interrupt pending * I/O and terminate in a timely fashion. */ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); while (*slurmdbd_conn->shutdown == 0) { /* START_TIMER; */ slurm_mutex_lock(&slurmdbd_lock); if (halt_agent) slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock); if ((slurmdbd_conn->fd < 0) && (difftime(time(NULL), fail_time) >= 10)) { /* The connection to Slurm DBD is not open */ _open_slurmdbd_conn(1); if (slurmdbd_conn->fd < 0) fail_time = time(NULL); } slurm_mutex_lock(&agent_lock); if (agent_list && slurmdbd_conn->fd) cnt = list_count(agent_list); else cnt = 0; if ((cnt == 0) || (slurmdbd_conn->fd < 0) || (fail_time && (difftime(time(NULL), fail_time) < 10))) { slurm_mutex_unlock(&slurmdbd_lock); abs_time.tv_sec = time(NULL) + 10; abs_time.tv_nsec = 0; slurm_cond_timedwait(&agent_cond, &agent_lock, &abs_time); slurm_mutex_unlock(&agent_lock); continue; } else if ((cnt > 0) && ((cnt % 100) == 0)) info("slurmdbd: agent queue size %u", cnt); /* Leave item on the queue until processing complete */ if (agent_list) { int handle_agent_count = 1000; if (cnt > handle_agent_count) { int agent_count = 0; ListIterator agent_itr = list_iterator_create(agent_list); list_msg.my_list = list_create(NULL); while ((buffer = list_next(agent_itr))) { list_enqueue(list_msg.my_list, buffer); agent_count++; if (agent_count > handle_agent_count) break; } list_iterator_destroy(agent_itr); buffer = pack_slurmdbd_msg( &list_req, SLURM_PROTOCOL_VERSION); } else if (cnt > 1) { list_msg.my_list = agent_list; buffer = pack_slurmdbd_msg( &list_req, SLURM_PROTOCOL_VERSION); } else buffer = (Buf) list_peek(agent_list); } else buffer = NULL; slurm_mutex_unlock(&agent_lock); if (buffer == NULL) { slurm_mutex_unlock(&slurmdbd_lock); slurm_mutex_lock(&assoc_cache_mutex); if (slurmdbd_conn->fd >= 0 && running_cache) slurm_cond_signal(&assoc_cache_cond); slurm_mutex_unlock(&assoc_cache_mutex); continue; } /* NOTE: agent_lock is clear here, so we can add more * requests to the queue while waiting for this RPC to * complete. */ rc = slurm_persist_send_msg(slurmdbd_conn, buffer); if (rc != SLURM_SUCCESS) { if (*slurmdbd_conn->shutdown) { slurm_mutex_unlock(&slurmdbd_lock); break; } error("slurmdbd: Failure sending message: %d: %m", rc); } else if (list_msg.my_list) { rc = _handle_mult_rc_ret(); } else { rc = _get_return_code(); if (rc == EAGAIN) { if (*slurmdbd_conn->shutdown) { slurm_mutex_unlock(&slurmdbd_lock); break; } error("slurmdbd: Failure with " "message need to resend: %d: %m", rc); } } slurm_mutex_unlock(&slurmdbd_lock); slurm_mutex_lock(&assoc_cache_mutex); if (slurmdbd_conn->fd >= 0 && running_cache) slurm_cond_signal(&assoc_cache_cond); slurm_mutex_unlock(&assoc_cache_mutex); slurm_mutex_lock(&agent_lock); if (agent_list && (rc == SLURM_SUCCESS)) { /* * If we sent a mult_msg we just need to free buffer, * we don't need to requeue, just mark list_msg.my_list * as NULL as that is the sign we sent a mult_msg. 
*/ if (list_msg.my_list) { if (list_msg.my_list != agent_list) FREE_NULL_LIST(list_msg.my_list); list_msg.my_list = NULL; } else buffer = (Buf) list_dequeue(agent_list); free_buf(buffer); fail_time = 0; } else { /* We need to free a mult_msg even on failure */ if (list_msg.my_list) { if (list_msg.my_list != agent_list) FREE_NULL_LIST(list_msg.my_list); list_msg.my_list = NULL; free_buf(buffer); } fail_time = time(NULL); } slurm_mutex_unlock(&agent_lock); /* END_TIMER; */ /* info("at the end with %s", TIME_STR); */ } slurm_mutex_lock(&agent_lock); _save_dbd_state(); FREE_NULL_LIST(agent_list); slurm_mutex_unlock(&agent_lock); return NULL; }
/* Send commands requested in params to the server. */ static void server_command (struct parameters *params, lists_t_strs *args) { int sock; if ((sock = server_connect()) == -1) fatal ("The server is not running!"); xsignal (SIGPIPE, SIG_IGN); if (!ping_server (sock)) fatal ("Can't connect to the server!"); if (params->playit) interface_cmdline_playit (sock, args); if (params->clear) interface_cmdline_clear_plist (sock); if (params->append) interface_cmdline_append (sock, args); if (params->enqueue) interface_cmdline_enqueue (sock, args); if (params->play) interface_cmdline_play_first (sock); if (params->get_file_info) interface_cmdline_file_info (sock); if (params->seek_by) interface_cmdline_seek_by (sock, params->seek_by); if (params->jump_type=='%') interface_cmdline_jump_to_percent (sock,params->jump_to); if (params->jump_type=='s') interface_cmdline_jump_to (sock,params->jump_to); if (params->get_formatted_info) interface_cmdline_formatted_info (sock, params->formatted_info_param); if (params->adj_volume) interface_cmdline_adj_volume (sock, params->adj_volume); if (params->toggle) interface_cmdline_set (sock, params->toggle, 2); if (params->on) interface_cmdline_set (sock, params->on, 1); if (params->off) interface_cmdline_set (sock, params->off, 0); if (params->exit) { if (!send_int(sock, CMD_QUIT)) fatal ("Can't send command!"); } else if (params->stop) { if (!send_int(sock, CMD_STOP) || !send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } else if (params->pause) { if (!send_int(sock, CMD_PAUSE) || !send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } else if (params->next) { if (!send_int(sock, CMD_NEXT) || !send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } else if (params->previous) { if (!send_int(sock, CMD_PREV) || !send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } else if (params->unpause) { if (!send_int(sock, CMD_UNPAUSE) || !send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } else if (params->toggle_pause) { int state, ev, cmd = -1; if (!send_int(sock, CMD_GET_STATE)) fatal ("Can't send commands!"); if (!get_int(sock, &ev) || ev != EV_DATA || !get_int(sock, &state)) fatal ("Can't get data from the server!"); if (state == STATE_PAUSE) cmd = CMD_UNPAUSE; else if (state == STATE_PLAY) cmd = CMD_PAUSE; if (cmd != -1 && !send_int(sock, cmd)) fatal ("Can't send commands!"); if (!send_int(sock, CMD_DISCONNECT)) fatal ("Can't send commands!"); } close (sock); }
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, void (*signal_function)(int), sig_atomic_t *destroy_job) { int i, rc; unsigned long step_wait = 0, my_sleep = 0; time_t begin_time; uint16_t base_dist; if (!job) { error("launch_common_create_job_step: no job given"); return SLURM_ERROR; } slurm_step_ctx_params_t_init(&job->ctx_params); job->ctx_params.job_id = job->jobid; job->ctx_params.uid = opt.uid; /* Validate minimum and maximum node counts */ if (opt.min_nodes && opt.max_nodes && (opt.min_nodes > opt.max_nodes)) { error ("Minimum node count > maximum node count (%d > %d)", opt.min_nodes, opt.max_nodes); return SLURM_ERROR; } #if !defined HAVE_FRONT_END || (defined HAVE_BGQ) //#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES) if (opt.min_nodes && (opt.min_nodes > job->nhosts)) { error ("Minimum node count > allocated node count (%d > %d)", opt.min_nodes, job->nhosts); return SLURM_ERROR; } #endif job->ctx_params.min_nodes = job->nhosts; if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes)) job->ctx_params.min_nodes = opt.min_nodes; job->ctx_params.max_nodes = job->nhosts; if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes)) job->ctx_params.max_nodes = opt.max_nodes; if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL)) job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node; job->ctx_params.task_count = opt.ntasks; if (opt.mem_per_cpu != NO_VAL) job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU; else if (opt.pn_min_memory != NO_VAL) job->ctx_params.pn_min_memory = opt.pn_min_memory; if (opt.gres) job->ctx_params.gres = opt.gres; else job->ctx_params.gres = getenv("SLURM_STEP_GRES"); if (opt.overcommit) { if (use_all_cpus) /* job allocation created by srun */ job->ctx_params.cpu_count = job->cpu_count; else job->ctx_params.cpu_count = job->ctx_params.min_nodes; } else if (opt.cpus_set) { job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task; } else if (opt.ntasks_set) { job->ctx_params.cpu_count = opt.ntasks; } else if (use_all_cpus) { /* job allocation created by srun */ job->ctx_params.cpu_count = job->cpu_count; } else { job->ctx_params.cpu_count = opt.ntasks; } job->ctx_params.cpu_freq_min = opt.cpu_freq_min; job->ctx_params.cpu_freq_max = opt.cpu_freq_max; job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov; job->ctx_params.relative = (uint16_t)opt.relative; job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval; job->ctx_params.ckpt_dir = opt.ckpt_dir; job->ctx_params.exclusive = (uint16_t)opt.exclusive; if (opt.immediate == 1) job->ctx_params.immediate = (uint16_t)opt.immediate; if (opt.time_limit != NO_VAL) job->ctx_params.time_limit = (uint32_t)opt.time_limit; job->ctx_params.verbose_level = (uint16_t)_verbose; if (opt.resv_port_cnt != NO_VAL) job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt; else { #if defined(HAVE_NATIVE_CRAY) /* * On Cray systems default to reserving one port, or one * more than the number of multi prog commands, for Cray PMI */ job->ctx_params.resv_port_cnt = (opt.multi_prog ? 
opt.multi_prog_cmds + 1 : 1); #endif } switch (opt.distribution & SLURM_DIST_STATE_BASE) { case SLURM_DIST_BLOCK: case SLURM_DIST_ARBITRARY: case SLURM_DIST_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_BLOCK_CFULL: job->ctx_params.task_dist = opt.distribution; if (opt.ntasks_per_node != NO_VAL) job->ctx_params.plane_size = opt.ntasks_per_node; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; job->ctx_params.plane_size = opt.plane_size; break; default: base_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; opt.distribution &= SLURM_DIST_STATE_FLAGS; opt.distribution |= base_dist; job->ctx_params.task_dist = opt.distribution; if (opt.ntasks_per_node != NO_VAL) job->ctx_params.plane_size = opt.ntasks_per_node; break; } job->ctx_params.overcommit = opt.overcommit ? 1 : 0; job->ctx_params.node_list = opt.nodelist; job->ctx_params.network = opt.network; job->ctx_params.no_kill = opt.no_kill; if (opt.job_name_set_cmd && opt.job_name) job->ctx_params.name = opt.job_name; else job->ctx_params.name = opt.cmd_name; job->ctx_params.features = opt.constraints; debug("requesting job %u, user %u, nodes %u including (%s)", job->ctx_params.job_id, job->ctx_params.uid, job->ctx_params.min_nodes, job->ctx_params.node_list); debug("cpus %u, tasks %u, name %s, relative %u", job->ctx_params.cpu_count, job->ctx_params.task_count, job->ctx_params.name, job->ctx_params.relative); begin_time = time(NULL); for (i=0; (!(*destroy_job)); i++) { bool blocking_step_create = true; if (opt.no_alloc) { job->step_ctx = slurm_step_ctx_create_no_alloc( &job->ctx_params, job->stepid); } else if (opt.immediate) { job->step_ctx = slurm_step_ctx_create( &job->ctx_params); } else { /* Wait 60 to 70 seconds for response */ step_wait = (getpid() % 10) * 1000 + 60000; job->step_ctx = slurm_step_ctx_create_timeout( &job->ctx_params, step_wait); } if (job->step_ctx != NULL) { if (i > 0) info("Job step created"); break; } rc = slurm_get_errno(); if (((opt.immediate != 0) && ((opt.immediate == 1) || (difftime(time(NULL), begin_time) > opt.immediate))) || ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) && (rc != ESLURM_PROLOG_RUNNING) && (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) && (rc != ESLURM_INTERCONNECT_BUSY) && (rc != ESLURM_DISABLED))) { error ("Unable to create job step: %m"); return SLURM_ERROR; } if (rc == ESLURM_DISABLED) /* job suspended */ blocking_step_create = false; if (i == 0) { if (rc == ESLURM_PROLOG_RUNNING) { verbose("Resources allocated for job %u and " "being configured, please wait", job->ctx_params.job_id); } else { info("Job step creation temporarily disabled, " "retrying"); } xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], signal_function); if (!blocking_step_create) my_sleep = (getpid() % 1000) * 100 + 100000; } else { verbose("Job step creation still disabled, retrying"); if (!blocking_step_create) my_sleep *= 2; } if (!blocking_step_create) { /* sleep 0.1 to 29 secs with exponential back-off */ my_sleep = MIN(my_sleep, 29000000); usleep(my_sleep); } if (*destroy_job) { /* cancelled by signal */ break; } } if (i > 0) { xsignal_block(sig_array); if (*destroy_job) { info("Cancelled pending job step"); return SLURM_ERROR; } } slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid); /* Number of hosts in job may not have been initialized yet if * 
--jobid was used or only SLURM_JOB_ID was set in user env. * Reset the value here just in case. */ slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS, &job->nhosts); /* * Recreate filenames which may depend upon step id */ job_update_io_fnames(job); return SLURM_SUCCESS; }
/* Process incoming RPCs. Meant to execute as a pthread */ extern void *rpc_mgr(void *no_data) { pthread_attr_t thread_attr_rpc_req; slurm_fd_t sockfd, newsockfd; int i, retry_cnt, sigarray[] = {SIGUSR1, 0}; slurm_addr_t cli_addr; slurmdbd_conn_t *conn_arg = NULL; slurm_mutex_lock(&thread_count_lock); master_thread_id = pthread_self(); slurm_mutex_unlock(&thread_count_lock); (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); /* threads to process individual RPC's are detached */ slurm_attr_init(&thread_attr_rpc_req); if (pthread_attr_setdetachstate (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED)) fatal("pthread_attr_setdetachstate %m"); /* initialize port for RPCs */ if ((sockfd = slurm_init_msg_engine_port(get_dbd_port())) == SLURM_SOCKET_ERROR) fatal("slurm_init_msg_engine_port error %m"); /* Prepare to catch SIGUSR1 to interrupt accept(). * This signal is generated by the slurmdbd signal * handler thread upon receipt of SIGABRT, SIGINT, * or SIGTERM. That thread does all processing of * all signals. */ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); /* * Process incoming RPCs until told to shutdown */ while ((i = _wait_for_server_thread()) >= 0) { /* * accept needed for stream implementation is a no-op in * message implementation that just passes sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { _free_server_thread((pthread_t) 0); if (errno != EINTR) error("slurm_accept_msg_conn: %m"); continue; } fd_set_nonblocking(newsockfd); conn_arg = xmalloc(sizeof(slurmdbd_conn_t)); conn_arg->newsockfd = newsockfd; slurm_get_ip_str(&cli_addr, &conn_arg->orig_port, conn_arg->ip, sizeof(conn_arg->ip)); retry_cnt = 0; while (pthread_create(&slave_thread_id[i], &thread_attr_rpc_req, _service_connection, (void *) conn_arg)) { if (retry_cnt > 0) { error("pthread_create failure, " "aborting RPC: %m"); close(newsockfd); break; } error("pthread_create failure: %m"); retry_cnt++; usleep(1000); /* retry in 1 msec */ } } debug3("rpc_mgr shutting down"); slurm_attr_destroy(&thread_attr_rpc_req); (void) slurm_shutdown_msg_engine(sockfd); _wait_for_thread_fini(); pthread_exit((void *) 0); return NULL; }
int main(int argc, char *argv[]) { log_options_t logopt = LOG_OPTS_STDERR_ONLY; job_desc_msg_t desc; resource_allocation_response_msg_t *alloc; time_t before, after; allocation_msg_thread_t *msg_thr; char **env = NULL; int status = 0; int retries = 0; pid_t pid = getpid(); pid_t tpgid = 0; pid_t rc_pid = 0; int i, rc = 0; static char *msg = "Slurm job queue full, sleeping and retrying."; slurm_allocation_callbacks_t callbacks; log_init(xbasename(argv[0]), logopt, 0, NULL); _set_exit_code(); if (spank_init_allocator() < 0) { error("Failed to initialize plugin stack"); exit(error_exit); } /* Be sure to call spank_fini when salloc exits */ if (atexit((void (*) (void)) spank_fini) < 0) error("Failed to register atexit handler for plugins: %m"); if (initialize_and_process_args(argc, argv) < 0) { error("salloc parameter parsing"); exit(error_exit); } /* reinit log with new verbosity (if changed by command line) */ if (opt.verbose || opt.quiet) { logopt.stderr_level += opt.verbose; logopt.stderr_level -= opt.quiet; logopt.prefix_level = 1; log_alter(logopt, 0, NULL); } if (spank_init_post_opt() < 0) { error("Plugin stack post-option processing failed"); exit(error_exit); } _set_spank_env(); _set_submit_dir_env(); if (opt.cwd && chdir(opt.cwd)) { error("chdir(%s): %m", opt.cwd); exit(error_exit); } if (opt.get_user_env_time >= 0) { char *user = uid_to_string(opt.uid); if (strcmp(user, "nobody") == 0) { error("Invalid user id %u: %m", (uint32_t)opt.uid); exit(error_exit); } env = env_array_user_default(user, opt.get_user_env_time, opt.get_user_env_mode); xfree(user); if (env == NULL) exit(error_exit); /* error already logged */ _set_rlimits(env); } /* * Job control for interactive salloc sessions: only if ... * * a) input is from a terminal (stdin has valid termios attributes), * b) controlling terminal exists (non-negative tpgid), * c) salloc is not run in allocation-only (--no-shell) mode, * d) salloc runs in its own process group (true in interactive * shells that support job control), * e) salloc has been configured at compile-time to support background * execution and is not currently in the background process group. */ if (tcgetattr(STDIN_FILENO, &saved_tty_attributes) < 0) { /* * Test existence of controlling terminal (tpgid > 0) * after first making sure stdin is not redirected. */ } else if ((tpgid = tcgetpgrp(STDIN_FILENO)) < 0) { if (!opt.no_shell) { error("no controlling terminal: please set --no-shell"); exit(error_exit); } } else if ((!opt.no_shell) && (pid == getpgrp())) { if (tpgid == pid) is_interactive = true; #ifdef SALLOC_RUN_FOREGROUND while (tcgetpgrp(STDIN_FILENO) != pid) { if (!is_interactive) { error("Waiting for program to be placed in " "the foreground"); is_interactive = true; } killpg(pid, SIGTTIN); } #endif } /* * Reset saved tty attributes at exit, in case a child * process died before properly resetting terminal. 
*/ if (is_interactive) atexit (_reset_input_mode); /* * Request a job allocation */ slurm_init_job_desc_msg(&desc); if (_fill_job_desc_from_opts(&desc) == -1) { exit(error_exit); } if (opt.gid != (gid_t) -1) { if (setgid(opt.gid) < 0) { error("setgid: %m"); exit(error_exit); } } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&desc.other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); before = time(NULL); while ((alloc = slurm_allocate_resources_blocking(&desc, opt.immediate, _pending_callback)) == NULL) { if ((errno != ESLURM_ERROR_ON_DESC_TO_RECORD_COPY) || (retries >= MAX_RETRIES)) break; if (retries == 0) error("%s", msg); else debug("%s", msg); sleep (++retries); } /* become the user after the allocation has been requested. */ if (opt.uid != (uid_t) -1) { if (setuid(opt.uid) < 0) { error("setuid: %m"); exit(error_exit); } } if (alloc == NULL) { if (allocation_interrupted) { /* cancelled by signal */ info("Job aborted due to signal"); } else if (errno == EINTR) { error("Interrupted by signal." " Allocation request rescinded."); } else if (opt.immediate && ((errno == ETIMEDOUT) || (errno == ESLURM_NOT_TOP_PRIORITY) || (errno == ESLURM_NODES_BUSY))) { error("Unable to allocate resources: %m"); error_exit = immediate_exit; } else { error("Failed to allocate resources: %m"); } slurm_allocation_msg_thr_destroy(msg_thr); exit(error_exit); } else if (!allocation_interrupted) { /* * Allocation granted! */ info("Granted job allocation %u", alloc->job_id); pending_job_id = alloc->job_id; #ifdef HAVE_BG if (!_wait_bluegene_block_ready(alloc)) { if(!allocation_interrupted) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else if (!_wait_nodes_ready(alloc)) { if(!allocation_interrupted) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } after = time(NULL); if (opt.bell == BELL_ALWAYS || (opt.bell == BELL_AFTER_DELAY && ((after - before) > DEFAULT_BELL_DELAY))) { _ring_terminal_bell(); } if (opt.no_shell) exit(0); if (allocation_interrupted) { /* salloc process received a signal after * slurm_allocate_resources_blocking returned with the * allocation, but before the new signal handlers were * registered. */ goto relinquish; } /* * Run the user's command. 
*/ if (env_array_for_job(&env, alloc, &desc) != SLURM_SUCCESS) goto relinquish; /* Add default task count for srun, if not already set */ if (opt.ntasks_set) { env_array_append_fmt(&env, "SLURM_NTASKS", "%d", opt.ntasks); /* keep around for old scripts */ env_array_append_fmt(&env, "SLURM_NPROCS", "%d", opt.ntasks); } if (opt.cpus_per_task > 1) { env_array_append_fmt(&env, "SLURM_CPUS_PER_TASK", "%d", opt.cpus_per_task); } if (opt.overcommit) { env_array_append_fmt(&env, "SLURM_OVERCOMMIT", "%d", opt.overcommit); } if (opt.acctg_freq >= 0) { env_array_append_fmt(&env, "SLURM_ACCTG_FREQ", "%d", opt.acctg_freq); } if (opt.network) env_array_append_fmt(&env, "SLURM_NETWORK", "%s", opt.network); env_array_set_environment(env); env_array_free(env); pthread_mutex_lock(&allocation_state_lock); if (allocation_state == REVOKED) { error("Allocation was revoked for job %u before command could " "be run", alloc->job_id); pthread_mutex_unlock(&allocation_state_lock); if (slurm_complete_job(alloc->job_id, status) != 0) { error("Unable to clean up allocation for job %u: %m", alloc->job_id); } return 1; } allocation_state = GRANTED; pthread_mutex_unlock(&allocation_state_lock); /* Ensure that salloc has initial terminal foreground control. */ if (is_interactive) { /* * Ignore remaining job-control signals (other than those in * sig_array, which at this state act like SIG_IGN). */ xsignal(SIGTSTP, SIG_IGN); xsignal(SIGTTIN, SIG_IGN); xsignal(SIGTTOU, SIG_IGN); pid = getpid(); setpgid(pid, pid); tcsetpgrp(STDIN_FILENO, pid); } command_pid = _fork_command(command_argv); /* * Wait for command to exit, OR for waitpid to be interrupted by a * signal. Either way, we are going to release the allocation next. */ if (command_pid > 0) { setpgid(command_pid, command_pid); if (is_interactive) tcsetpgrp(STDIN_FILENO, command_pid); /* NOTE: Do not process signals in separate pthread. * The signal will cause waitpid() to exit immediately. */ xsignal(SIGHUP, _exit_on_signal); /* Use WUNTRACED to treat stopped children like terminated ones */ do { rc_pid = waitpid(command_pid, &status, WUNTRACED); } while ((rc_pid == -1) && (!exit_flag)); if ((rc_pid == -1) && (errno != EINTR)) error("waitpid for %s failed: %m", command_argv[0]); } if (is_interactive) tcsetpgrp(STDIN_FILENO, pid); /* * Relinquish the job allocation (if not already revoked). */ relinquish: pthread_mutex_lock(&allocation_state_lock); if (allocation_state != REVOKED) { pthread_mutex_unlock(&allocation_state_lock); info("Relinquishing job allocation %d", alloc->job_id); if ((slurm_complete_job(alloc->job_id, status) != 0) && (slurm_get_errno() != ESLURM_ALREADY_DONE)) error("Unable to clean up job allocation %d: %m", alloc->job_id); pthread_mutex_lock(&allocation_state_lock); allocation_state = REVOKED; } pthread_mutex_unlock(&allocation_state_lock); slurm_free_resource_allocation_response_msg(alloc); slurm_allocation_msg_thr_destroy(msg_thr); /* * Figure out what return code we should use. If the user's command * exited normally, return the user's return code. */ rc = 1; if (rc_pid != -1) { if (WIFEXITED(status)) { rc = WEXITSTATUS(status); } else if (WIFSIGNALED(status)) { verbose("Command \"%s\" was terminated by signal %d", command_argv[0], WTERMSIG(status)); /* if we get these signals we return a normal * exit since this was most likely sent from the * user */ switch(WTERMSIG(status)) { case SIGHUP: case SIGINT: case SIGQUIT: case SIGKILL: rc = 0; break; default: break; } } } return rc; }
int main (int argc, char *argv[]) { int i, pidfd; int blocked_signals[] = {SIGPIPE, 0}; int cc; char *oom_value; uint32_t slurmd_uid = 0; uint32_t curr_uid = 0; char time_stamp[256]; log_options_t lopts = LOG_OPTS_INITIALIZER; /* NOTE: logfile is NULL at this point */ log_init(argv[0], lopts, LOG_DAEMON, NULL); /* * Make sure we have no extra open files which * would be propagated to spawned tasks. */ cc = sysconf(_SC_OPEN_MAX); for (i = 3; i < cc; i++) close(i); /* * Drop supplementary groups. */ if (geteuid() == 0) { if (setgroups(0, NULL) != 0) { fatal("Failed to drop supplementary groups, " "setgroups: %m"); } } else { debug("Not running as root. Can't drop supplementary groups"); } /* * Create and set default values for the slurmd global * config variable "conf" */ conf = xmalloc(sizeof(slurmd_conf_t)); _init_conf(); conf->argv = &argv; conf->argc = &argc; if (_slurmd_init() < 0) { error( "slurmd initialization failed" ); fflush( NULL ); exit(1); } slurmd_uid = slurm_get_slurmd_user_id(); curr_uid = getuid(); if (curr_uid != slurmd_uid) { struct passwd *pw = NULL; char *slurmd_user = NULL; char *curr_user = NULL; /* since when you do a getpwuid you get a pointer to a * structure you have to do a xstrdup on the first * call or your information will just get over * written. This is a memory leak, but a fatal is * called right after so it isn't that big of a deal. */ if ((pw=getpwuid(slurmd_uid))) slurmd_user = xstrdup(pw->pw_name); if ((pw=getpwuid(curr_uid))) curr_user = pw->pw_name; fatal("You are running slurmd as something " "other than user %s(%d). If you want to " "run as this user add SlurmdUser=%s " "to the slurm.conf file.", slurmd_user, slurmd_uid, curr_user); } init_setproctitle(argc, argv); xsignal(SIGTERM, &_term_handler); xsignal(SIGINT, &_term_handler); xsignal(SIGHUP, &_hup_handler ); xsignal_block(blocked_signals); debug3("slurmd initialization successful"); /* * Become a daemon if desired. * Do not chdir("/") or close all fd's */ if (conf->daemonize) { if (daemon(1,1) == -1) error("Couldn't daemonize slurmd: %m"); } test_core_limit(); info("slurmd version %s started", SLURM_VERSION_STRING); debug3("finished daemonize"); if ((oom_value = getenv("SLURMD_OOM_ADJ"))) { i = atoi(oom_value); debug("Setting slurmd oom_adj to %d", i); set_oom_adj(i); } _kill_old_slurmd(); if (conf->mlock_pages) { /* * Call mlockall() if available to ensure slurmd * doesn't get swapped out */ #ifdef _POSIX_MEMLOCK if (mlockall (MCL_FUTURE | MCL_CURRENT) < 0) error ("failed to mlock() slurmd pages: %m"); #else error ("mlockall() system call does not appear to be available"); #endif /* _POSIX_MEMLOCK */ } /* * Restore any saved revoked credential information */ if (!conf->cleanstart && (_restore_cred_state(conf->vctx) < 0)) return SLURM_FAILURE; if (jobacct_gather_init() != SLURM_SUCCESS) fatal("Unable to initialize jobacct_gather"); if (job_container_init() < 0) fatal("Unable to initialize job_container plugin."); if (container_g_restore(conf->spooldir, !conf->cleanstart)) error("Unable to restore job_container state."); if (switch_g_node_init() < 0) fatal("Unable to initialize interconnect."); if (conf->cleanstart && switch_g_clear_node_state()) fatal("Unable to clear interconnect state."); switch_g_slurmd_init(); _create_msg_socket(); conf->pid = getpid(); /* This has to happen after daemon(), which closes all fd's, so we keep the write lock of the pidfile. 
*/ pidfd = create_pidfile(conf->pidfile, 0); rfc2822_timestamp(time_stamp, sizeof(time_stamp)); info("%s started on %s", slurm_prog_name, time_stamp); _install_fork_handlers(); list_install_fork_handlers(); slurm_conf_install_fork_handlers(); /* * Initialize any plugins */ if (slurmd_plugstack_init()) fatal("failed to initialize slurmd_plugstack"); _spawn_registration_engine(); _msg_engine(); /* * Close fd here, otherwise we'll deadlock since create_pidfile() * flocks the pidfile. */ if (pidfd >= 0) /* valid pidfd, non-error */ (void) close(pidfd); /* Ignore errors */ if (unlink(conf->pidfile) < 0) error("Unable to remove pidfile `%s': %m", conf->pidfile); _wait_for_all_threads(120); _slurmd_fini(); _destroy_conf(); slurm_crypto_fini(); /* must be after _destroy_conf() */ info("Slurmd shutdown completing"); log_fini(); return 0; }
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus, void (*signal_function)(int), sig_atomic_t *destroy_job, slurm_opt_t *opt_local) { srun_opt_t *srun_opt = opt_local->srun_opt; int i, j, rc; unsigned long step_wait = 0; uint16_t base_dist, slurmctld_timeout; char *add_tres; xassert(srun_opt); if (!job) { error("launch_common_create_job_step: no job given"); return SLURM_ERROR; } slurm_step_ctx_params_t_init(&job->ctx_params); job->ctx_params.job_id = job->jobid; job->ctx_params.step_id = job->stepid; job->ctx_params.uid = opt_local->uid; /* Validate minimum and maximum node counts */ if (opt_local->min_nodes && opt_local->max_nodes && (opt_local->min_nodes > opt_local->max_nodes)) { error ("Minimum node count > maximum node count (%d > %d)", opt_local->min_nodes, opt_local->max_nodes); return SLURM_ERROR; } #if !defined HAVE_FRONT_END if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) { error ("Minimum node count > allocated node count (%d > %d)", opt_local->min_nodes, job->nhosts); return SLURM_ERROR; } #endif job->ctx_params.min_nodes = job->nhosts; if (opt_local->min_nodes && (opt_local->min_nodes < job->ctx_params.min_nodes)) job->ctx_params.min_nodes = opt_local->min_nodes; job->ctx_params.max_nodes = job->nhosts; if (opt_local->max_nodes && (opt_local->max_nodes < job->ctx_params.max_nodes)) job->ctx_params.max_nodes = opt_local->max_nodes; if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL)) job->ntasks = opt_local->ntasks = job->nhosts * opt_local->ntasks_per_node; job->ctx_params.task_count = opt_local->ntasks; if (opt_local->mem_per_cpu != NO_VAL64) job->ctx_params.pn_min_memory = opt_local->mem_per_cpu | MEM_PER_CPU; else if (opt_local->pn_min_memory != NO_VAL64) job->ctx_params.pn_min_memory = opt_local->pn_min_memory; if (opt_local->overcommit) { if (use_all_cpus) /* job allocation created by srun */ job->ctx_params.cpu_count = job->cpu_count; else job->ctx_params.cpu_count = job->ctx_params.min_nodes; } else if (opt_local->cpus_set) { job->ctx_params.cpu_count = opt_local->ntasks * opt_local->cpus_per_task; } else if (opt_local->ntasks_set) { job->ctx_params.cpu_count = opt_local->ntasks; } else if (use_all_cpus) { /* job allocation created by srun */ job->ctx_params.cpu_count = job->cpu_count; } else { job->ctx_params.cpu_count = opt_local->ntasks; } job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min; job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max; job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov; job->ctx_params.relative = (uint16_t)srun_opt->relative; job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval; job->ctx_params.ckpt_dir = srun_opt->ckpt_dir; job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive; if (opt_local->immediate == 1) job->ctx_params.immediate = (uint16_t)opt_local->immediate; if (opt_local->time_limit != NO_VAL) job->ctx_params.time_limit = (uint32_t)opt_local->time_limit; job->ctx_params.verbose_level = (uint16_t)_verbose; if (srun_opt->resv_port_cnt != NO_VAL) { job->ctx_params.resv_port_cnt = (uint16_t)srun_opt->resv_port_cnt; } else { #if defined(HAVE_NATIVE_CRAY) /* * On Cray systems default to reserving one port, or one * more than the number of multi prog commands, for Cray PMI */ job->ctx_params.resv_port_cnt = (srun_opt->multi_prog ? 
srun_opt->multi_prog_cmds + 1 : 1); #endif } switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK: case SLURM_DIST_ARBITRARY: case SLURM_DIST_CYCLIC: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_BLOCK_CYCLIC: case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_CFULL: case SLURM_DIST_BLOCK_CFULL: job->ctx_params.task_dist = opt_local->distribution; if (opt_local->ntasks_per_node != NO_VAL) job->ctx_params.plane_size = opt_local->ntasks_per_node; break; case SLURM_DIST_PLANE: job->ctx_params.task_dist = SLURM_DIST_PLANE; job->ctx_params.plane_size = opt_local->plane_size; break; default: /* Leave distribution set to unknown if taskcount <= nodes and * memory is set to 0. step_mgr will handle the 0mem case. * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */ if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory) base_dist = SLURM_DIST_UNKNOWN; else base_dist = (job->ctx_params.task_count <= job->ctx_params.min_nodes) ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK; opt_local->distribution &= SLURM_DIST_STATE_FLAGS; opt_local->distribution |= base_dist; job->ctx_params.task_dist = opt_local->distribution; if (opt_local->ntasks_per_node != NO_VAL) job->ctx_params.plane_size = opt_local->ntasks_per_node; break; } job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0; job->ctx_params.node_list = opt_local->nodelist; job->ctx_params.network = opt_local->network; job->ctx_params.no_kill = opt_local->no_kill; if (srun_opt->job_name_set_cmd && opt_local->job_name) job->ctx_params.name = opt_local->job_name; else job->ctx_params.name = srun_opt->cmd_name; job->ctx_params.features = opt_local->constraints; if (opt_local->cpus_per_gpu) { xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d", opt_local->cpus_per_gpu); } xfree(opt_local->tres_bind); /* Vestigial value from job allocate */ if (opt_local->gpu_bind) xstrfmtcat(opt_local->tres_bind, "gpu:%s", opt_local->gpu_bind); if (tres_bind_verify_cmdline(opt_local->tres_bind)) { if (tres_bind_err_log) { /* Log once */ error("Invalid --tres-bind argument: %s. Ignored", opt_local->tres_bind); tres_bind_err_log = false; } xfree(opt_local->tres_bind); } job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind); xfree(opt_local->tres_freq); /* Vestigial value from job allocate */ xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq); if (tres_freq_verify_cmdline(opt_local->tres_freq)) { if (tres_freq_err_log) { /* Log once */ error("Invalid --tres-freq argument: %s. 
Ignored", opt_local->tres_freq); tres_freq_err_log = false; } xfree(opt_local->tres_freq); } job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq); job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job); xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus); xfmt_tres(&job->ctx_params.tres_per_node, "gpu", opt_local->gpus_per_node); if (opt_local->gres) add_tres = opt_local->gres; else add_tres = getenv("SLURM_STEP_GRES"); if (add_tres) { if (job->ctx_params.tres_per_node) { xstrfmtcat(job->ctx_params.tres_per_node, ",%s", add_tres); } else job->ctx_params.tres_per_node = xstrdup(add_tres); } xfmt_tres(&job->ctx_params.tres_per_socket, "gpu", opt_local->gpus_per_socket); xfmt_tres(&job->ctx_params.tres_per_task, "gpu", opt_local->gpus_per_task); if (opt_local->mem_per_gpu) { xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64, opt.mem_per_gpu); } debug("requesting job %u, user %u, nodes %u including (%s)", job->ctx_params.job_id, job->ctx_params.uid, job->ctx_params.min_nodes, job->ctx_params.node_list); debug("cpus %u, tasks %u, name %s, relative %u", job->ctx_params.cpu_count, job->ctx_params.task_count, job->ctx_params.name, job->ctx_params.relative); for (i = 0; (!(*destroy_job)); i++) { if (srun_opt->no_alloc) { job->step_ctx = slurm_step_ctx_create_no_alloc( &job->ctx_params, job->stepid); } else { if (opt_local->immediate) { step_wait = MAX(1, opt_local->immediate - difftime(time(NULL), srun_begin_time)) * 1000; } else { slurmctld_timeout = MIN(300, MAX(60, slurm_get_slurmctld_timeout())); step_wait = ((getpid() % 10) + slurmctld_timeout) * 1000; } job->step_ctx = slurm_step_ctx_create_timeout( &job->ctx_params, step_wait); } if (job->step_ctx != NULL) { if (i > 0) { info("Step created for job %u", job->ctx_params.job_id); } break; } rc = slurm_get_errno(); if (((opt_local->immediate != 0) && ((opt_local->immediate == 1) || (difftime(time(NULL), srun_begin_time) >= opt_local->immediate))) || ((rc != ESLURM_PROLOG_RUNNING) && !slurm_step_retry_errno(rc))) { error("Unable to create step for job %u: %m", job->ctx_params.job_id); return SLURM_ERROR; } if (i == 0) { if (rc == ESLURM_PROLOG_RUNNING) { verbose("Resources allocated for job %u and " "being configured, please wait", job->ctx_params.job_id); } else { info("Job %u step creation temporarily disabled, retrying", job->ctx_params.job_id); } xsignal_unblock(sig_array); for (j = 0; sig_array[j]; j++) xsignal(sig_array[j], signal_function); } else { verbose("Job %u step creation still disabled, retrying", job->ctx_params.job_id); } if (*destroy_job) { /* cancelled by signal */ break; } } if (i > 0) { xsignal_block(sig_array); if (*destroy_job) { info("Cancelled pending step for job %u", job->ctx_params.job_id); return SLURM_ERROR; } } slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid); /* * Number of hosts in job may not have been initialized yet if * --jobid was used or only SLURM_JOB_ID was set in user env. * Reset the value here just in case. */ slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS, &job->nhosts); /* * Recreate filenames which may depend upon step id */ job_update_io_fnames(job, opt_local); return SLURM_SUCCESS; }
/* * _thread_per_group_rpc - thread to issue an RPC for a group of nodes * sending message out to one and forwarding it to * others if necessary. * IN/OUT args - pointer to task_info_t, xfree'd on completion */ static void *_thread_per_group_rpc(void *args) { int rc = SLURM_SUCCESS; slurm_msg_t msg; task_info_t *task_ptr = (task_info_t *) args; /* we cache some pointers from task_info_t because we need * to xfree args before being finished with their use. xfree * is required for timely termination of this pthread because * xfree could lock it at the end, preventing a timely * thread_exit */ pthread_mutex_t *thread_mutex_ptr = task_ptr->thread_mutex_ptr; pthread_cond_t *thread_cond_ptr = task_ptr->thread_cond_ptr; uint32_t *threads_active_ptr = task_ptr->threads_active_ptr; thd_t *thread_ptr = task_ptr->thread_struct_ptr; state_t thread_state = DSH_NO_RESP; slurm_msg_type_t msg_type = task_ptr->msg_type; bool is_kill_msg, srun_agent; List ret_list = NULL; ListIterator itr; ret_data_info_t *ret_data_info = NULL; int found = 0; int sig_array[2] = {SIGUSR1, 0}; /* Locks: Write job, write node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; xassert(args != NULL); xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sig_array); is_kill_msg = ( (msg_type == REQUEST_KILL_TIMELIMIT) || (msg_type == REQUEST_TERMINATE_JOB) ); srun_agent = ( (msg_type == SRUN_PING) || (msg_type == SRUN_EXEC) || (msg_type == SRUN_JOB_COMPLETE) || (msg_type == SRUN_STEP_MISSING) || (msg_type == SRUN_TIMEOUT) || (msg_type == SRUN_USER_MSG) || (msg_type == RESPONSE_RESOURCE_ALLOCATION) || (msg_type == SRUN_NODE_FAIL) ); thread_ptr->start_time = time(NULL); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->state = DSH_ACTIVE; thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT; slurm_mutex_unlock(thread_mutex_ptr); /* send request message */ slurm_msg_t_init(&msg); msg.msg_type = msg_type; msg.data = task_ptr->msg_args_ptr; #if 0 info("sending message type %u to %s", msg_type, thread_ptr->nodelist); #endif if (task_ptr->get_reply) { if(thread_ptr->addr) { msg.address = *thread_ptr->addr; if(!(ret_list = slurm_send_addr_recv_msgs( &msg, thread_ptr->nodelist, 0))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } else { if(!(ret_list = slurm_send_recv_msgs( thread_ptr->nodelist, &msg, 0, true))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } } else { if(thread_ptr->addr) { //info("got the address"); msg.address = *thread_ptr->addr; } else { //info("no address given"); if(slurm_conf_get_addr(thread_ptr->nodelist, &msg.address) == SLURM_ERROR) { error("_thread_per_group_rpc: " "can't find address for host %s, " "check slurm.conf", thread_ptr->nodelist); goto cleanup; } } //info("sending %u to %s", msg_type, thread_ptr->nodelist); if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) { thread_state = DSH_DONE; } else { if (!srun_agent) _comm_err(thread_ptr->nodelist, msg_type); } goto cleanup; } //info("got %d messages back", list_count(ret_list)); found = 0; itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr)) != NULL) { rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); /* SPECIAL CASE: Mark node as IDLE if job already complete */ if (is_kill_msg && (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) { kill_job_msg_t *kill_job; kill_job = (kill_job_msg_t *) task_ptr->msg_args_ptr; rc = SLURM_SUCCESS; lock_slurmctld(job_write_lock); if (job_epilog_complete(kill_job->job_id, ret_data_info-> node_name, 
rc)) run_scheduler = true; unlock_slurmctld(job_write_lock); } /* SPECIAL CASE: Kill non-startable batch job, * Requeue the job on ESLURMD_PROLOG_FAILED */ if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) && (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) && (ret_data_info->type != RESPONSE_FORWARD_FAILED)) { batch_job_launch_msg_t *launch_msg_ptr = task_ptr->msg_args_ptr; uint32_t job_id = launch_msg_ptr->job_id; info("Killing non-startable batch job %u: %s", job_id, slurm_strerror(rc)); thread_state = DSH_DONE; ret_data_info->err = thread_state; lock_slurmctld(job_write_lock); job_complete(job_id, 0, false, false, _wif_status()); unlock_slurmctld(job_write_lock); continue; } if (((msg_type == REQUEST_SIGNAL_TASKS) || (msg_type == REQUEST_TERMINATE_TASKS)) && (rc == ESRCH)) { /* process is already dead, not a real error */ rc = SLURM_SUCCESS; } switch (rc) { case SLURM_SUCCESS: /* debug("agent processed RPC to node %s", */ /* ret_data_info->node_name); */ thread_state = DSH_DONE; break; case SLURM_UNKNOWN_FORWARD_ADDR: error("We were unable to forward message to '%s'. " "Make sure the slurm.conf for each slurmd " "contain all other nodes in your system.", ret_data_info->node_name); thread_state = DSH_NO_RESP; break; case ESLURMD_EPILOG_FAILED: error("Epilog failure on host %s, " "setting DOWN", ret_data_info->node_name); thread_state = DSH_FAILED; break; case ESLURMD_PROLOG_FAILED: thread_state = DSH_FAILED; break; case ESLURM_INVALID_JOB_ID: /* Not indicative of a real error */ case ESLURMD_JOB_NOTRUNNING: /* Not indicative of a real error */ debug2("agent processed RPC to node %s: %s", ret_data_info->node_name, slurm_strerror(rc)); thread_state = DSH_DONE; break; default: if (!srun_agent) { if (ret_data_info->err) errno = ret_data_info->err; else errno = rc; rc = _comm_err(ret_data_info->node_name, msg_type); } if (srun_agent) thread_state = DSH_FAILED; else if(ret_data_info->type == RESPONSE_FORWARD_FAILED) /* check if a forward failed */ thread_state = DSH_NO_RESP; else { /* some will fail that don't mean anything went * bad like a job term request on a job that is * already finished, we will just exit on those * cases */ thread_state = DSH_DONE; } } ret_data_info->err = thread_state; } list_iterator_destroy(itr); cleanup: xfree(args); /* handled at end of thread just in case resend is needed */ destroy_forward(&msg.forward); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->ret_list = ret_list; thread_ptr->state = thread_state; thread_ptr->end_time = (time_t) difftime(time(NULL), thread_ptr->start_time); /* Signal completion so another thread can replace us */ (*threads_active_ptr)--; pthread_cond_signal(thread_cond_ptr); slurm_mutex_unlock(thread_mutex_ptr); return (void *) NULL; }
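/*
 * Hedged sketch (generic pthread code, not the agent itself) of the
 * completion handshake at the end of the thread above: each worker
 * decrements a shared "active" counter under a mutex and signals a
 * condition variable, so a dispatcher blocked in pthread_cond_wait()
 * can start a replacement thread without busy-waiting.
 */
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  slot_free = PTHREAD_COND_INITIALIZER;
static unsigned int    active;          /* worker threads currently running */

static void *worker(void *arg)
{
	(void) arg;
	/* ... perform the per-node work here ... */
	pthread_mutex_lock(&lock);
	active--;
	pthread_cond_signal(&slot_free);    /* wake the waiting dispatcher */
	pthread_mutex_unlock(&lock);
	return NULL;
}

void dispatch_one(unsigned int max_active)
{
	pthread_t tid;

	pthread_mutex_lock(&lock);
	while (active >= max_active)        /* wait until a slot opens up */
		pthread_cond_wait(&slot_free, &lock);
	active++;
	pthread_mutex_unlock(&lock);

	pthread_create(&tid, NULL, worker, NULL);
	pthread_detach(tid);                /* nobody joins; the count is the limit */
}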
/* _background_rpc_mgr - Read and process incoming RPCs to the background * controller (that's us) */ static void *_background_rpc_mgr(void *no_data) { slurm_fd_t newsockfd; slurm_fd_t sockfd; slurm_addr_t cli_addr; slurm_msg_t *msg = NULL; int error_code; char* node_addr = NULL; /* Read configuration only */ slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; int sigarray[] = {SIGUSR1, 0}; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid()); /* initialize port for RPCs */ lock_slurmctld(config_read_lock); /* set node_addr to bind to (NULL means any) */ if ((strcmp(slurmctld_conf.backup_controller, slurmctld_conf.backup_addr) != 0)) { node_addr = slurmctld_conf.backup_addr ; } if ((sockfd = slurm_init_msg_engine_addrname_port(node_addr, slurmctld_conf. slurmctld_port)) == SLURM_SOCKET_ERROR) fatal("slurm_init_msg_engine_addrname_port error %m"); unlock_slurmctld(config_read_lock); /* Prepare to catch SIGUSR1 to interrupt accept(). * This signal is generated by the slurmctld signal * handler thread upon receipt of SIGABRT, SIGINT, * or SIGTERM. That thread does all processing of * all signals. */ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); /* * Process incoming RPCs indefinitely */ while (slurmctld_config.shutdown_time == 0) { /* accept needed for stream implementation * is a no-op in message implementation that just passes * sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { if (errno != EINTR) error("slurm_accept_msg_conn: %m"); continue; } msg = xmalloc(sizeof(slurm_msg_t)); slurm_msg_t_init(msg); if (slurm_receive_msg(newsockfd, msg, 0) != 0) error("slurm_receive_msg: %m"); error_code = _background_process_msg(msg); if ((error_code == SLURM_SUCCESS) && (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) && (slurmctld_config.shutdown_time == 0)) slurmctld_config.shutdown_time = time(NULL); slurm_free_msg_data(msg->msg_type, msg->data); slurm_free_msg(msg); slurm_close(newsockfd); /* close new socket */ } debug3("_background_rpc_mgr shutting down"); slurm_close(sockfd); /* close the main socket */ pthread_exit((void *) 0); return NULL; }
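/*
 * Minimal, self-contained sketch (not slurmctld code) of the pattern used
 * above: a no-op handler for a dedicated wake-up signal is installed
 * without SA_RESTART, so a blocking accept() returns -1/EINTR and the
 * loop gets a chance to re-check its shutdown flag.
 */
#include <arpa/inet.h>
#include <errno.h>
#include <netinet/in.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static volatile sig_atomic_t shutting_down;

static void wakeup_handler(int sig)   { (void) sig; }          /* interrupt only */
static void shutdown_handler(int sig) { (void) sig; shutting_down = 1; }

int main(void)
{
	struct sigaction sa;
	struct sockaddr_in addr;
	int lfd;

	memset(&sa, 0, sizeof(sa));
	sigemptyset(&sa.sa_mask);
	sa.sa_handler = wakeup_handler;       /* sa_flags == 0: no SA_RESTART */
	sigaction(SIGUSR1, &sa, NULL);
	sa.sa_handler = shutdown_handler;
	sigaction(SIGTERM, &sa, NULL);

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;                    /* any free port */
	if (lfd < 0 || bind(lfd, (struct sockaddr *) &addr, sizeof(addr)) < 0 ||
	    listen(lfd, 8) < 0) {
		perror("socket/bind/listen");
		return 1;
	}

	while (!shutting_down) {
		int cfd = accept(lfd, NULL, NULL);
		if (cfd < 0) {
			if (errno == EINTR)   /* woken by SIGUSR1 or SIGTERM */
				continue;
			perror("accept");
			break;
		}
		/* ... read and dispatch one request here ... */
		close(cfd);
	}
	close(lfd);
	return 0;
}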
int main(int argc, char **argv) { int debug = 0, conftest = 0, nodaemon = 0; int i = 0; unsigned int clilen; struct sockaddr_in cliaddr; struct timeval start, end; char *cnfg = DEF_CNFG; char localhost[MAXNAMLEN + 1], path[MAXNAMLEN + 1]; FILE *pidfp; octet_t oct; ruleset_t *rs; /* * Who am I running as ? */ uname(&myname); /* * spocp_err = 0 ; */ memset(&srv, 0, sizeof(srv_t)); pthread_mutex_init(&(srv.mutex), NULL); pthread_mutex_init(&(srv.mlock), NULL); gethostname(localhost, MAXNAMLEN); #ifdef HAVE_GETDOMAINNAME getdomainname(path, MAXNAMLEN); #else { char *pos; if(pos = strstr(localhost, ".")) strncpy(path, pos+1, MAXNAMLEN); else strcpy(path, ""); } #endif if (0) printf("Domain: %s\n", path); srv.hostname = Strdup(localhost); /* * truncating input strings to reasonable length */ for (i = 0; i < argc; i++) if (strlen(argv[i]) > 512) argv[i][512] = '\0'; while ((i = getopt(argc, argv, "Dhrtf:d:")) != EOF) { switch (i) { case 'D': nodaemon = 1; break; case 'f': cnfg = Strdup(optarg); break; case 'd': debug = atoi(optarg); if (debug < 0) debug = 0; break; case 't': conftest = 1; break; case 'r': srv.readonly = 1; case 'h': default: fprintf(stderr, "Usage: %s [-t] ", argv[0]); fprintf(stderr, "[-f configfile] "); fprintf(stderr, "[-D] [-d debuglevel]\n"); exit(0); } } srv.root = ruleset_new(0); if (srv_init(&srv, cnfg) < 0) exit(1); if (srv.port && srv.uds) { fprintf(stderr, "Sorry are not allowed to listen on both a unix domain socket and a port\n"); exit(1); } if (srv.logfile) spocp_open_log(srv.logfile, debug); else if (debug) spocp_open_log(0, debug); if (srv.name){ localcontext = (char *) Calloc(strlen(srv.name) + strlen("//") + 1, sizeof(char)); /* Flawfinder: ignore */ sprintf(localcontext, "//%s", srv.name); } else { localcontext = (char *) Calloc(strlen(localhost) + strlen("//") + 1, sizeof(char)); /* Flawfinder: ignore */ sprintf(localcontext, "//%s", localhost); } /* * where I put the access rules for access to this server and its * rules */ snprintf(path, MAXNAMLEN, "%s/server", localcontext); oct_assign(&oct, path); if ((rs = ruleset_create(&oct, srv.root)) == 0) exit(1); rs->db = db_new(); /* * access rules for operations */ snprintf(path, MAXNAMLEN, "%s/operation", localcontext); oct_assign(&oct, path); if ((rs = ruleset_create(&oct, srv.root)) == 0) exit(1); rs->db = db_new(); LOG(SPOCP_INFO) { traceLog(LOG_INFO, "Local context: \"%s\"", localcontext); traceLog(LOG_INFO, "initializing backends"); if (srv.root->db) plugin_display(srv.plugin); } if (srv.plugin) { run_plugin_init(&srv); } if ( get_rules( &srv ) != SPOCP_SUCCESS ) exit(1); /*ruleset_tree( srv.root, 0);*/ /* If only testing configuration and rulefile this is as far as I go */ if (conftest) { traceLog(LOG_INFO,"Configuration was OK"); exit(0); } gettimeofday(&start, NULL); if (srv.port || srv.uds) { /* * stdin and stdout will not be used from here on, close to * save file descriptors */ fclose(stdin); fclose(stdout); #ifdef HAVE_SSL /* * ---------------------------------------------------------- */ /* * build our SSL context, whether it will ever be used or not */ /* * mutex'es for openSSL to use */ THREAD_setup(); if (srv.certificateFile && srv.privateKey && srv.caList) { traceLog(LOG_INFO,"Initializing the TLS/SSL environment"); if (!(srv.ctx = tls_init(&srv))) { return FALSE; } } /* * ---------------------------------------------------------- */ #endif #ifdef HAVE_SASL { int r = sasl_server_init(sasl_cb, "spocp"); if (r != SASL_OK) { traceLog( LOG_ERR, "Unable to initialized SASL library: %s", 
sasl_errstring(r, NULL, NULL)); return FALSE; } } #endif saci_init(); if( nodaemon == 0 ) { #ifdef HAVE_DAEMON if (daemon(1, 1) < 0) { fprintf(stderr, "couldn't go daemon\n"); exit(1); } #else daemon_init("spocp", 0); #endif } if (srv.pidfile) { /* * Write the PID file. */ pidfp = fopen(srv.pidfile, "w"); if (pidfp == (FILE *) 0) { fprintf(stderr, "Couldn't open pidfile \"%s\"\n", srv.pidfile); exit(1); } fprintf(pidfp, "%d\n", (int) getpid()); fclose(pidfp); } if (srv.port) { LOG(SPOCP_INFO) traceLog( LOG_INFO, "Asked to listen on port %d", srv.port); if ((srv.listen_fd = spocp_stream_socket(srv.port)) < 0) exit(1); srv.id = (char *) Malloc(16); sprintf(srv.id, "spocp-%d", srv.port); srv.type = AF_INET; } else { LOG(SPOCP_INFO) traceLog(LOG_INFO,"Asked to listen on unix domain socket"); if ((srv.listen_fd = spocp_unix_domain_socket(srv.uds)) < 0) exit(1); srv.id = (char *) Malloc(7 + strlen(srv.uds)); /* Flawfinder: ignore */ sprintf(srv.id, "spocp-%s", srv.uds); srv.type = AF_UNIX; } xsignal(SIGCHLD, sig_chld); xsignal(SIGPIPE, sig_pipe); xsignal(SIGINT, sig_int); xsignal(SIGTERM, sig_term); xsignal(SIGUSR1, sig_usr1); clilen = sizeof(cliaddr); DEBUG(SPOCP_DSRV) traceLog(LOG_DEBUG,"Creating threads"); /* * returns the pool the threads are picking work from */ srv.work = tpool_init(srv.threads, 64, 1); spocp_srv_run(&srv); } else { conn_t *conn; saci_init(); DEBUG(SPOCP_DSRV) traceLog(LOG_DEBUG,"---->"); LOG(SPOCP_INFO) traceLog(LOG_INFO,"Reading STDIN"); /* * If I want to use this I have to do init_server() first * conn = spocp_open_connection( STDIN_FILENO, &srv ) ; */ /* * this is much simpler */ conn = conn_new(); conn_setup(conn, &srv, STDIN_FILENO, "localhost", "127.0.0.1"); LOG(SPOCP_INFO) traceLog(LOG_INFO,"Running server"); spocp_server((void *) conn); gettimeofday(&end, NULL); print_elapsed("query time:", start, end); conn_free( conn ); } srv_free( &srv ); if (cnfg != DEF_CNFG) Free( cnfg ); exit(0); }
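/*
 * Sketch (generic, not spocpd code; daemon(3) is a BSD/glibc extension) of
 * the daemonize-then-write-pidfile order used above: the PID is recorded
 * only after the daemonizing fork, so the file names the process that
 * signals actually have to be sent to.
 */
#include <stdio.h>
#include <unistd.h>

int daemonize_with_pidfile(const char *pidfile)
{
	FILE *fp;

	if (daemon(0, 0) < 0) {                 /* fork, setsid, chdir to "/" */
		perror("daemon");
		return -1;
	}
	fp = fopen(pidfile, "w");
	if (fp == NULL)
		return -1;
	fprintf(fp, "%d\n", (int) getpid());    /* PID of the daemonized process */
	fclose(fp);
	return 0;
}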
void progressmeter(int flag) { static off_t lastsize; off_t cursize; struct timeval now, wait; #ifndef NO_PROGRESS struct timeval td; off_t abbrevsize, bytespersec; double elapsed; int ratio, i, remaining, barlength; /* * Work variables for progress bar. * * XXX: if the format of the progress bar changes * (especially the number of characters in the * `static' portion of it), be sure to update * these appropriately. */ #endif size_t len; char buf[256]; /* workspace for progress bar */ #ifndef NO_PROGRESS #define BAROVERHEAD 45 /* non `*' portion of progress bar */ /* * stars should contain at least * sizeof(buf) - BAROVERHEAD entries */ static const char stars[] = "*****************************************************************************" "*****************************************************************************" "*****************************************************************************"; #endif if (flag == -1) { (void)gettimeofday(&start, NULL); lastupdate = start; lastsize = restart_point; } (void)gettimeofday(&now, NULL); cursize = bytes + restart_point; timersub(&now, &lastupdate, &wait); if (cursize > lastsize) { lastupdate = now; lastsize = cursize; wait.tv_sec = 0; } else { #ifndef STANDALONE_PROGRESS if (quit_time > 0 && wait.tv_sec > quit_time) { len = snprintf(buf, sizeof(buf), "\r\n%s: " "transfer aborted because stalled for %lu sec.\r\n", getprogname(), (unsigned long)wait.tv_sec); (void)write(fileno(ttyout), buf, len); (void)xsignal(SIGALRM, SIG_DFL); alarmtimer(0); siglongjmp(toplevel, 1); } #endif /* !STANDALONE_PROGRESS */ } /* * Always set the handler even if we are not the foreground process. */ #ifdef STANDALONE_PROGRESS if (progress) { #else if (quit_time > 0 || progress) { #endif /* !STANDALONE_PROGRESS */ if (flag == -1) { (void)xsignal_restart(SIGALRM, updateprogressmeter, 1); alarmtimer(1); /* set alarm timer for 1 Hz */ } else if (flag == 1) { (void)xsignal(SIGALRM, SIG_DFL); alarmtimer(0); } } #ifndef NO_PROGRESS if (!progress) return; len = 0; /* * print progress bar only if we are foreground process. */ if (! 
foregroundproc()) return; len += snprintf(buf + len, BUFLEFT, "\r"); if (prefix) len += snprintf(buf + len, BUFLEFT, "%s", prefix); if (filesize > 0) { ratio = (int)((double)cursize * 100.0 / (double)filesize); ratio = MAX(ratio, 0); ratio = MIN(ratio, 100); len += snprintf(buf + len, BUFLEFT, "%3d%% ", ratio); /* * calculate the length of the `*' bar, ensuring that * the number of stars won't exceed the buffer size */ barlength = MIN(sizeof(buf) - 1, ttywidth) - BAROVERHEAD; if (prefix) barlength -= (int)strlen(prefix); if (barlength > 0) { i = barlength * ratio / 100; len += snprintf(buf + len, BUFLEFT, "|%.*s%*s|", i, stars, (int)(barlength - i), ""); } } abbrevsize = cursize; for (i = 0; abbrevsize >= 100000 && i < NSUFFIXES; i++) abbrevsize >>= 10; if (i == NSUFFIXES) i--; len += snprintf(buf + len, BUFLEFT, " " LLFP("5") " %-3s ", (LLT)abbrevsize, suffixes[i]); timersub(&now, &start, &td); elapsed = td.tv_sec + (td.tv_usec / 1000000.0); bytespersec = 0; if (bytes > 0) { bytespersec = bytes; if (elapsed > 0.0) bytespersec /= elapsed; } for (i = 1; bytespersec >= 1024000 && i < NSUFFIXES; i++) bytespersec >>= 10; len += snprintf(buf + len, BUFLEFT, " " LLFP("3") ".%02d %.2sB/s ", (LLT)(bytespersec / 1024), (int)((bytespersec % 1024) * 100 / 1024), suffixes[i]); if (filesize > 0) { if (bytes <= 0 || elapsed <= 0.0 || cursize > filesize) { len += snprintf(buf + len, BUFLEFT, " --:-- ETA"); } else if (wait.tv_sec >= STALLTIME) { len += snprintf(buf + len, BUFLEFT, " - stalled -"); } else { remaining = (int) ((filesize - restart_point) / (bytes / elapsed) - elapsed); if (remaining >= 100 * SECSPERHOUR) len += snprintf(buf + len, BUFLEFT, " --:-- ETA"); else { i = remaining / SECSPERHOUR; if (i) len += snprintf(buf + len, BUFLEFT, "%2d:", i); else len += snprintf(buf + len, BUFLEFT, " "); i = remaining % SECSPERHOUR; len += snprintf(buf + len, BUFLEFT, "%02d:%02d ETA", i / 60, i % 60); } } } if (flag == 1) len += snprintf(buf + len, BUFLEFT, "\n"); (void)write(fileno(ttyout), buf, len); #endif /* !NO_PROGRESS */ } #ifndef STANDALONE_PROGRESS /* * Display transfer statistics. * Requires start to be initialised by progressmeter(-1), * direction to be defined by xfer routines, and filesize and bytes * to be updated by xfer routines * If siginfo is nonzero, an ETA is displayed, and the output goes to stderr * instead of ttyout. */ void ptransfer(int siginfo) { struct timeval now, td, wait; double elapsed; off_t bytespersec; int remaining, hh, i; size_t len; char buf[256]; /* Work variable for transfer status. */ if (!verbose && !progress && !siginfo) return; (void)gettimeofday(&now, NULL); timersub(&now, &start, &td); elapsed = td.tv_sec + (td.tv_usec / 1000000.0); bytespersec = 0; if (bytes > 0) { bytespersec = bytes; if (elapsed > 0.0) bytespersec /= elapsed; } len = 0; len += snprintf(buf + len, BUFLEFT, LLF " byte%s %s in ", (LLT)bytes, bytes == 1 ? "" : "s", direction); remaining = (int)elapsed; if (remaining > SECSPERDAY) { int days; days = remaining / SECSPERDAY; remaining %= SECSPERDAY; len += snprintf(buf + len, BUFLEFT, "%d day%s ", days, days == 1 ? 
"" : "s"); } hh = remaining / SECSPERHOUR; remaining %= SECSPERHOUR; if (hh) len += snprintf(buf + len, BUFLEFT, "%2d:", hh); len += snprintf(buf + len, BUFLEFT, "%02d:%02d ", remaining / 60, remaining % 60); for (i = 1; bytespersec >= 1024000 && i < NSUFFIXES; i++) bytespersec >>= 10; if (i == NSUFFIXES) i--; len += snprintf(buf + len, BUFLEFT, "(" LLF ".%02d %.2sB/s)", (LLT)(bytespersec / 1024), (int)((bytespersec % 1024) * 100 / 1024), suffixes[i]); if (siginfo && bytes > 0 && elapsed > 0.0 && filesize >= 0 && bytes + restart_point <= filesize) { remaining = (int)((filesize - restart_point) / (bytes / elapsed) - elapsed); hh = remaining / SECSPERHOUR; remaining %= SECSPERHOUR; len += snprintf(buf + len, BUFLEFT, " ETA: "); if (hh) len += snprintf(buf + len, BUFLEFT, "%2d:", hh); len += snprintf(buf + len, BUFLEFT, "%02d:%02d", remaining / 60, remaining % 60); timersub(&now, &lastupdate, &wait); if (wait.tv_sec >= STALLTIME) len += snprintf(buf + len, BUFLEFT, " (stalled)"); } len += snprintf(buf + len, BUFLEFT, "\n"); (void)write(siginfo ? STDERR_FILENO : fileno(ttyout), buf, len); } /* * SIG{INFO,QUIT} handler to print transfer stats if a transfer is in progress */ void psummary(int notused) { int oerrno = errno; if (bytes > 0) { if (fromatty) write(fileno(ttyout), "\n", 1); ptransfer(1); } errno = oerrno; }
/*
 * Allocate a credential. This function should return NULL if it cannot
 * allocate a credential. Whether the credential is populated with useful
 * data at this time is implementation-dependent.
 */
slurm_auth_credential_t *slurm_auth_create(void *argv[], char *socket)
{
	int retry = 2;
	slurm_auth_credential_t *cred = NULL;
	munge_err_t e = EMUNGE_SUCCESS;
	munge_ctx_t ctx = munge_ctx_create();
	SigFunc *ohandler;

	if (ctx == NULL) {
		error("munge_ctx_create failure");
		return NULL;
	}

#if 0
	/* This logic can be used to determine what socket is used by default.
	 * A typical name is "/var/run/munge/munge.socket.2" */
	{
		char *old_socket;
		if (munge_ctx_get(ctx, MUNGE_OPT_SOCKET, &old_socket)
		    != EMUNGE_SUCCESS)
			error("munge_ctx_get failure");
		else
			info("Default Munge socket is %s", old_socket);
	}
#endif

	if (socket &&
	    (munge_ctx_set(ctx, MUNGE_OPT_SOCKET, socket) != EMUNGE_SUCCESS)) {
		error("munge_ctx_set failure");
		munge_ctx_destroy(ctx);
		return NULL;
	}

	cred = xmalloc(sizeof(*cred));
	cred->verified = false;
	cred->m_str = NULL;
	cred->buf = NULL;
	cred->len = 0;
	cred->cr_errno = SLURM_SUCCESS;

	xassert(cred->magic = MUNGE_MAGIC);

	/*
	 * Temporarily ignore SIGALRM to avoid a misleading
	 * "Munged communication error" from libmunge if we
	 * happen to time out the connection in this section of
	 * code.
	 */
	ohandler = xsignal(SIGALRM, SIG_IGN);

again:
	e = munge_encode(&cred->m_str, ctx, cred->buf, cred->len);
	if (e != EMUNGE_SUCCESS) {
		if ((e == EMUNGE_SOCKET) && retry--) {
			error("Munge encode failed: %s (retrying ...)",
			      munge_ctx_strerror(ctx));
#ifdef MULTIPLE_SLURMD
			sleep(1);
#endif
			goto again;
		}
		error("Munge encode failed: %s", munge_ctx_strerror(ctx));
		xfree(cred);
		cred = NULL;
		plugin_errno = e + MUNGE_ERRNO_OFFSET;
	}

	xsignal(SIGALRM, ohandler);
	munge_ctx_destroy(ctx);

	return cred;
}
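/*
 * Minimal sketch of the save/restore idiom above, using plain signal(2).
 * blocking_call() is a hypothetical stand-in for a library call (such as
 * an encode or connect routine) that can take long enough for an
 * application-armed SIGALRM timer to fire mid-call.
 */
#include <signal.h>

extern int blocking_call(void);          /* hypothetical */

int call_with_sigalrm_ignored(void)
{
	void (*old_handler)(int);
	int rc;

	old_handler = signal(SIGALRM, SIG_IGN); /* keep the timer from aborting us */
	rc = blocking_call();
	signal(SIGALRM, old_handler);           /* restore the caller's disposition */
	return rc;
}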
static void _handle_sigwinch(int sig)
{
	winch = 1;
	xsignal(SIGWINCH, _handle_sigwinch);
}
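/*
 * Hedged sketch (standalone, not srun code) of the same handler pattern:
 * the handler only sets a flag and re-arms itself, and the main loop does
 * the real work. The flag is volatile sig_atomic_t so the asynchronous
 * store from the handler is well defined.
 */
#include <signal.h>
#include <unistd.h>

static volatile sig_atomic_t got_winch;

static void on_winch(int sig)
{
	got_winch = 1;
	signal(sig, on_winch);          /* re-arm, for old-style signal() semantics */
}

int main(void)
{
	signal(SIGWINCH, on_winch);
	for (;;) {
		pause();                /* sleep until any signal arrives */
		if (got_winch) {
			got_winch = 0;
			/* re-query the terminal size here (e.g. TIOCGWINSZ) */
			write(STDOUT_FILENO, "resized\n", 8);
		}
	}
}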
int main(int argc, char *argv[]) { enum wavemon_screen cur, next; sigset_t blockmask, oldmask; getconf(argc, argv); if (!isatty(STDIN_FILENO)) errx(1, "input is not from a terminal"); /* honour numeric separators if the environment defines them */ setlocale(LC_NUMERIC, ""); /* initialize the ncurses interface */ initscr(); noecho(); nonl(); cbreak(); curs_set(0); clear(); check_geometry(); start_color(); init_pair(CP_STANDARD, COLOR_WHITE, COLOR_BLACK); init_pair(CP_SCALEHI, COLOR_RED, COLOR_BLACK); init_pair(CP_SCALEMID, COLOR_YELLOW, COLOR_BLACK); init_pair(CP_SCALELOW, COLOR_GREEN, COLOR_BLACK); init_pair(CP_WTITLE, COLOR_CYAN, COLOR_BLACK); init_pair(CP_INACTIVE, COLOR_CYAN, COLOR_BLACK); init_pair(CP_ACTIVE, COLOR_CYAN, COLOR_BLUE); init_pair(CP_STATSIG, COLOR_GREEN, COLOR_BLACK); init_pair(CP_STATNOISE, COLOR_RED, COLOR_BLACK); init_pair(CP_STATSNR, COLOR_BLUE, COLOR_BLUE); init_pair(CP_STATBKG, COLOR_BLUE, COLOR_BLACK); init_pair(CP_STATSIG_S, COLOR_GREEN, COLOR_BLUE); init_pair(CP_STATNOISE_S, COLOR_RED, COLOR_BLUE); init_pair(CP_PREF_NORMAL, COLOR_WHITE, COLOR_BLACK); init_pair(CP_PREF_SELECT, COLOR_WHITE, COLOR_BLUE); init_pair(CP_PREF_ARROW, COLOR_RED, COLOR_BLACK); init_pair(CP_SCAN_CRYPT, COLOR_RED, COLOR_BLACK); init_pair(CP_SCAN_UNENC, COLOR_GREEN, COLOR_BLACK); init_pair(CP_SCAN_NON_AP, COLOR_YELLOW, COLOR_BLACK); /* Override signal handlers installed during ncurses initialisation. */ xsignal(SIGCHLD, SIG_IGN); xsignal(SIGWINCH, sig_winch); /* triggers only when env_winch_ready */ sigemptyset(&blockmask); sigaddset(&blockmask, SIGWINCH); for (cur = conf.startup_scr; cur != SCR_QUIT; cur = next) { WINDOW *w_menu; int escape = 0; if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) < 0) err_sys("cannot block SIGWINCH"); next = cur; w_menu = init_menubar(cur); (*screens[cur].init)(); if (sigprocmask(SIG_SETMASK, &oldmask, NULL) < 0) err_sys("cannot unblock SIGWINCH"); if (sigsetjmp(env_winch, true) == 0) { env_winch_ready = true; do { int key = (*screens[cur].loop)(w_menu); if (key <= 0) usleep(5000); /* * Translate vt100 PF1..4 escape sequences sent * by some X terminals (e.g. aterm) into F1..F4. */ switch (key) { case 033: escape = 1; break; case 'O': escape = 2; break; case 'P' ... 'S': if (escape == 2) key = KEY_F(key - 'P' + 1); /* fall through */ default: escape = 0; } /* Main menu */ switch (key) { case 'i': case KEY_F(1): next = SCR_INFO; break; case 'l': case KEY_F(2): next = SCR_LHIST; break; case 's': case KEY_F(3): next = SCR_SCAN; break; case 'p': case KEY_F(7): next = SCR_PREFS; break; case 'h': case KEY_F(8): next = SCR_HELP; break; case 'a': case KEY_F(9): next = SCR_ABOUT; break; case 'q': case KEY_F(10): next = SCR_QUIT; } } while (next == cur); } delwin(w_menu); (*screens[cur].fini)(); /* * next = cur is set in the protected critical section before * sigsetjmp. Due to the loop condition, it can not occur when * no SIGWINCH occurred, hence it indicates a resizing event. */ if (next == cur) { struct winsize size; if (ioctl(STDIN_FILENO, TIOCGWINSZ, &size) < 0) err_sys("can not determine terminal size"); resizeterm(size.ws_row, size.ws_col); check_geometry(); } clear(); refresh(); }
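/*
 * Sketch of the masking idiom used above (generic, not wavemon code):
 * SIGWINCH is blocked while screen state is (re)initialised, so the
 * resize path never runs concurrently with it; a resize that arrives
 * meanwhile is delivered as soon as the old mask is restored. In a
 * multi-threaded program pthread_sigmask() would be used instead.
 */
#include <signal.h>

void reinit_screen_protected(void (*reinit)(void))
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGWINCH);
	sigprocmask(SIG_BLOCK, &block, &old);   /* hold resize events pending */
	reinit();                               /* safe from the SIGWINCH handler */
	sigprocmask(SIG_SETMASK, &old, NULL);   /* pending SIGWINCH delivered here */
}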
/* Run client and the server if needed. */ static void start_moc (const struct parameters *params, lists_t_strs *args) { int server_sock; if (params->foreground) { set_me_server (); server_init (params->debug, params->foreground); server_loop (); return; } server_sock = server_connect (); if (server_sock != -1 && params->only_server) fatal ("Server is already running!"); if (server_sock == -1) { int i = 0; int notify_pipe[2]; ssize_t rc; printf ("Running the server...\n"); /* To notify the client that the server socket is ready */ if (pipe(notify_pipe)) fatal ("pipe() failed: %s", xstrerror (errno)); switch (fork()) { case 0: /* child - start server */ set_me_server (); server_init (params->debug, params->foreground); rc = write (notify_pipe[1], &i, sizeof(i)); if (rc < 0) fatal ("write() to notify pipe failed: %s", xstrerror (errno)); close (notify_pipe[0]); close (notify_pipe[1]); server_loop (); options_free (); decoder_cleanup (); io_cleanup (); files_cleanup (); rcc_cleanup (); common_cleanup (); exit (EXIT_SUCCESS); case -1: fatal ("fork() failed: %s", xstrerror (errno)); default: close (notify_pipe[1]); if (read(notify_pipe[0], &i, sizeof(i)) != sizeof(i)) fatal ("Server exited!"); close (notify_pipe[0]); server_sock = server_connect (); if (server_sock == -1) { perror ("server_connect()"); fatal ("Can't connect to the server!"); } } } if (params->only_server) send_int (server_sock, CMD_DISCONNECT); else { xsignal (SIGPIPE, SIG_IGN); if (!ping_server (server_sock)) fatal ("Can't connect to the server!"); init_interface (server_sock, params->debug, args); interface_loop (); interface_end (); } close (server_sock); }
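/*
 * Sketch of the readiness handshake used above (generic names, not MOC
 * code): the parent blocks on a pipe read until the forked server signals
 * that its listening socket exists, which avoids a connect-and-retry race.
 */
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int ready[2];
	char byte = 0;

	if (pipe(ready) < 0) {
		perror("pipe");
		return 1;
	}

	switch (fork()) {
	case -1:
		perror("fork");
		return 1;
	case 0:                         /* child: the "server" */
		close(ready[0]);
		/* ... create and bind the listening socket here ... */
		write(ready[1], &byte, 1);      /* tell the parent we are ready */
		close(ready[1]);
		/* ... run the server loop ... */
		_exit(0);
	default:                        /* parent: the "client" */
		close(ready[1]);
		if (read(ready[0], &byte, 1) != 1) {
			fprintf(stderr, "server exited before becoming ready\n");
			return 1;
		}
		close(ready[0]);
		/* ... now connecting to the server socket cannot race ... */
		wait(NULL);
	}
	return 0;
}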
/* Initialize the server; exits via fatal() on error. */
void server_init (int debugging, int foreground)
{
	struct sockaddr_un sock_name;
	pid_t pid;

	logit ("Starting MOC Server");

	assert (server_sock == -1);

	pid = check_pid_file ();
	if (pid && valid_pid(pid)) {
		fprintf (stderr, "\nIt seems that the server is already running"
				" with pid %d.\n", pid);
		fprintf (stderr, "If it is not true, remove the pid file (%s)"
				" and try again.\n", create_file_name(PID_FILE));
		fatal ("Exiting!");
	}

	if (foreground)
		log_init_stream (stdout, "stdout");
	else {
		FILE *logfp;

		logfp = NULL;
		if (debugging) {
			logfp = fopen (SERVER_LOG, "a");
			if (!logfp)
				fatal ("Can't open server log file: %s",
				       xstrerror (errno));
		}
		log_init_stream (logfp, SERVER_LOG);
	}

	if (pipe(wake_up_pipe) < 0)
		fatal ("pipe() failed: %s", xstrerror (errno));

	unlink (socket_name());

	/* Create a socket.
	 * For reasons why AF_UNIX is the correct constant to use in both
	 * cases, see the commentary in the SVN log for commit r9999. */
	server_sock = socket (AF_UNIX, SOCK_STREAM, 0);
	if (server_sock == -1)
		fatal ("Can't create socket: %s", xstrerror (errno));

	sock_name.sun_family = AF_UNIX;
	strcpy (sock_name.sun_path, socket_name());

	/* Bind to socket */
	if (bind(server_sock, (struct sockaddr *)&sock_name,
				SUN_LEN(&sock_name)) == -1)
		fatal ("Can't bind() to the socket: %s", xstrerror (errno));

	if (listen(server_sock, 1) == -1)
		fatal ("listen() failed: %s", xstrerror (errno));

	/* Log stack sizes so stack overflows can be debugged. */
	log_process_stack_size ();
	log_pthread_stack_size ();

	clients_init ();
	audio_initialize ();
	tags_cache = tags_cache_new (options_get_int("TagsCacheSize"));
	tags_cache_load (tags_cache, create_file_name("cache"));

	server_tid = pthread_self ();
	xsignal (SIGTERM, sig_exit);
	xsignal (SIGINT, foreground ? sig_exit : SIG_IGN);
	xsignal (SIGHUP, SIG_IGN);
	xsignal (SIGQUIT, sig_exit);
	xsignal (SIGPIPE, SIG_IGN);
	xsignal (SIGCHLD, sig_chld);

	write_pid_file ();

	if (!foreground) {
		setsid ();
		redirect_output (stdin);
		redirect_output (stdout);
		redirect_output (stderr);
	}

	return;
}
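/*
 * Sketch (generic, not MOC code) of why the server above ignores SIGPIPE:
 * with the default disposition, a write() to a client that already closed
 * its end of the socket kills the whole process; with SIG_IGN the write
 * simply fails with EPIPE and can be handled per connection.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

ssize_t send_to_client(int fd, const void *buf, size_t len)
{
	ssize_t n = write(fd, buf, len);

	if (n < 0 && errno == EPIPE)
		fprintf(stderr, "client on fd %d disconnected\n", fd);
	return n;
}

int main(void)
{
	signal(SIGPIPE, SIG_IGN);       /* done once, before serving clients */
	/* ... accept clients and call send_to_client() as needed ... */
	return 0;
}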