/*
 * _reset_input_mode - put the terminal attached to stdin back into the
 *	state held in saved_tty_attributes.
 *
 * SIGTTOU needs to be blocked per the POSIX spec:
 * http://pubs.opengroup.org/onlinepubs/009695399/functions/tcsetattr.html
 * SIGTTIN is blocked together with it.
 */
static void _reset_input_mode (void)
{
	/* zero-terminated signal list handed to xsignal_block() */
	int tty_signals[] = { SIGTTOU, SIGTTIN, 0 };

	xsignal_block(tty_signals);

	/* TCSANOW: apply the saved attributes immediately */
	tcsetattr(STDIN_FILENO, TCSANOW, &saved_tty_attributes);
}
/*
 * _reset_input_mode - put the terminal attached to stdin back into the
 *	state held in saved_tty_attributes and, for an interactive run,
 *	give the terminal back to the parent's process group.
 *
 * SIGTTOU needs to be blocked per the POSIX spec:
 * http://pubs.opengroup.org/onlinepubs/009695399/functions/tcsetattr.html
 * SIGTTIN is blocked together with it.
 */
static void _reset_input_mode (void)
{
	/* zero-terminated signal list handed to xsignal_block() */
	int tty_signals[] = { SIGTTOU, SIGTTIN, 0 };

	xsignal_block(tty_signals);

	/* TCSANOW: apply the saved attributes immediately */
	tcsetattr(STDIN_FILENO, TCSANOW, &saved_tty_attributes);

	if (!is_interactive)
		return;

	/*
	 * If salloc was run as interactive, with job control, reset the
	 * foreground process group of the terminal to the process group of
	 * the parent pid before exiting
	 */
	tcsetpgrp(STDIN_FILENO, getpgid(getppid()));
}
/*
 * block_sigwinch - block every signal listed in pty_sigarray.
 *
 * SIGWINCH should already be blocked by srun/signal.c
 */
void block_sigwinch(void)
{
	(void) xsignal_block(pty_sigarray);
}
/*
 * launch_common_create_job_step - populate job->ctx_params from the srun
 *	options and create the job step, retrying for as long as the
 *	failure is one of the transient errors tested in the retry loop.
 *
 * IN/OUT job - srun job; ctx_params, step_ctx, stepid and nhosts are
 *	written here
 * IN use_all_cpus - selects job->cpu_count as the step CPU count (the
 *	"job allocation created by srun" case)
 * IN signal_function - handler installed for every signal in sig_array
 *	once the first creation attempt has failed
 * IN destroy_job - flag polled between attempts; non-zero means the
 *	pending step was cancelled by a signal
 * RET SLURM_SUCCESS, or SLURM_ERROR on invalid options, a non-retryable
 *	failure, an expired --immediate limit or cancellation
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int i, j, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;
	/* true once the signal handlers were installed for the retry phase */
	bool retrying = false;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif

	/* Node range: the allocation size, narrowed by the user's request */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	/* Derive the task count from --ntasks-per-node when not given */
	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;

	/* Fall back to the environment when no gres option was given */
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/*
		 * No base distribution requested: cyclic when there are no
		 * more tasks than nodes, block otherwise.  The flag bits of
		 * opt.distribution are kept.
		 */
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes) ?
			    SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i = 0; !(*destroy_job); i++) {
		bool blocking_step_create = true;

		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (retrying)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		/*
		 * Give up when the --immediate limit has expired or the
		 * error is not one of the transient ones.
		 */
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/*
			 * Let the caller's handler see the signals while we
			 * wait.  A separate index is used so the retry
			 * counter is not overwritten by the signal count.
			 */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
			retrying = true;
		} else {
			verbose("Job step creation still disabled, retrying");
		}

		if (!blocking_step_create) {
			/*
			 * sleep 0.1 to 29 secs with exponential back-off.
			 * The delay is seeded on the first non-blocking
			 * failure, whichever attempt that is, so it can
			 * never stay at zero.
			 */
			if (my_sleep == 0)
				my_sleep = (getpid() % 1000) * 100 + 100000;
			else
				my_sleep *= 2;
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (retrying) {
		/* Restore the signal mask changed for the retry phase */
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending job step");
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
/* run_backup - this is the backup controller, it should run in standby
 *	mode, assuming control when the primary controller stops responding
 *
 * Starts the background RPC and signal threads, then pings the primary
 * once a second loop iteration permits it.  The loop ends either because
 * slurmctld_config.shutdown_time was set (the process then exits) or
 * because the primary stopped answering (this controller then restores
 * state and returns to the caller as the active controller).
 *
 * IN callbacks - passed through unchanged to ctld_assoc_mgr_init()
 */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	/* lock descriptors handed to lock_slurmctld()/unlock_slurmctld() */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 * (creation is retried once a second until it succeeds)
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 * (creation is retried once a second until it succeeds)
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	/* Wait up to five seconds, stopping early on shutdown */
	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);	/* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		/*
		 * Outside takeover mode, skip the ping until a third of
		 * slurmctld_timeout (integer division) has passed since
		 * the previous one.
		 */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as */
			/* primary no longer respond */
			break;
		} else {
			uint32_t timeout;
			/* read the timeout under the config read lock */
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			/* primary silent for longer than the timeout */
			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	/* Shutdown requested while still in standby: clean up and exit */
	if (slurmctld_config.shutdown_time != 0) {
		/* Since pidfile is created as user root (its owner is
		 *   changed to SlurmUser) SlurmUser may not be able to
		 *   remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join.
		 * NOTE(review): in this function the pthread_join() comes
		 * after the unlink(), not before it - confirm the comment. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	/* Primary is not responding: take over as controller */
	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
		"BackupController %s taking over",
		slurmctld_conf.control_machine,
		slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	backup_slurmctld_restart();
	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	/* Stop the standby threads: SIGTERM the signal thread, then join both */
	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	/* The job list needs to be freed before we run
	 * ctld_assoc_mgr_init, it should be empty here in the first place.
	 */
	lock_slurmctld(config_write_lock);
	job_fini();
	init_job_conf();
	unlock_slurmctld(config_write_lock);

	ctld_assoc_mgr_init(callbacks);

	/* clear old state and read new state */
	/* either restore failing aborts the process */
	lock_slurmctld(config_write_lock);
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}
/*
 * create_job_step - populate job->ctx_params from the srun options and
 *	create the job step, retrying with exponential back-off for as
 *	long as the failure is one of the transient errors tested in the
 *	retry loop.
 *
 * IN/OUT job - srun job; ctx_params, step_ctx, stepid and nhosts are
 *	written here
 * IN use_all_cpus - if true the step is given job->cpu_count CPUs
 * RET 0 on success, -1 on invalid options, a non-retryable failure, an
 *	expired --immediate limit or cancellation (destroy_job set)
 */
extern int create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int i, j, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;
	/* true once the signal handlers were installed for the retry phase */
	bool retrying = false;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return -1;
	}
#endif

	/* Node range: the allocation size, narrowed by the user's request */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	/* Derive the task count from --ntasks-per-node when not given */
	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks*opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/*
		 * No distribution requested: cyclic when there are no more
		 * tasks than nodes, block otherwise; the choice is written
		 * back to opt.distribution.
		 */
		job->ctx_params.task_dist = (job->ctx_params.task_count <=
					     job->ctx_params.min_nodes)
					    ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i = 0; !destroy_job; i++) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		if (job->step_ctx != NULL) {
			if (retrying)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		/*
		 * Give up when the --immediate limit has expired or the
		 * error is not one of the transient ones.
		 */
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return -1;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/*
			 * Let the handler see the signals while we wait.
			 * A separate index is used so the retry counter is
			 * not overwritten by the signal count.
			 */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], _signal_while_allocating);
			retrying = true;

			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (retrying) {
		/* Restore the signal mask changed for the retry phase */
		xsignal_block(sig_array);
		if (destroy_job) {
			info("Cancelled pending job step");
			return -1;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 *    --jobid was used or only SLURM_JOB_ID was set in user env.
	 *    Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
resource_allocation_response_msg_t * allocate_nodes(void) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; #ifdef HAVE_BG if (!_wait_bluegene_block_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else if (!_wait_nodes_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: slurm_free_resource_allocation_response_msg(resp); if (!destroy_job) slurm_complete_job(resp->job_id, 1); exit(error_exit); return NULL; }
/*
 * main - slurmdbd main function, start various threads and process RPCs
 *
 * Initializes logging, configuration and plugins, optionally daemonizes,
 * starts the signal and commit handler threads, then loops: act as backup
 * or primary depending on the configured host names, run the RPC and
 * rollup threads, and leave the loop once shutdown_time is set.
 * Always terminates through exit(); never returns.
 */
int main(int argc, char *argv[])
{
	pthread_attr_t thread_attr;
	char node_name[128];
	void *db_conn = NULL;
	assoc_init_args_t assoc_init_arg;

	_init_config();
	log_init(argv[0], log_opts, LOG_DAEMON, NULL);
	if (read_slurmdbd_conf())
		exit(1);
	_parse_commandline(argc, argv);
	_update_logging(true);
	_update_nice();

	if (slurm_auth_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s authentication plugin",
		      slurmdbd_conf->auth_type);
	}
	if (slurm_acct_storage_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s accounting storage plugin",
		      slurmdbd_conf->storage_type);
	}
	_kill_old_slurmdbd();
	if (foreground == 0)
		_daemonize();

	/*
	 * Need to create pidfile here in case we setuid() below
	 * (init_pidfile() exits if it can't initialize pid file).
	 * On Linux we also need to make this setuid job explicitly
	 * able to write a core dump.
	 * This also has to happen after daemon(), which closes all fd's,
	 * so we keep the write lock of the pidfile.
	 */
	_init_pidfile();
	_become_slurm_user();
	if (foreground == 0)
		_set_work_dir();
	log_config();

#ifdef PR_SET_DUMPABLE
	if (prctl(PR_SET_DUMPABLE, 1) < 0)
		debug ("Unable to set dumpable to 1");
#endif /* PR_SET_DUMPABLE */

	if (xsignal_block(dbd_sigarray) < 0)
		error("Unable to block signals");

	/* Create attached thread for signal handling */
	slurm_attr_init(&thread_attr);
	if (pthread_create(&signal_handler_thread, &thread_attr,
			   _signal_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	registered_clusters = list_create(NULL);

	/* Create attached thread running _commit_handler */
	slurm_attr_init(&thread_attr);
	if (pthread_create(&commit_handler_thread, &thread_attr,
			   _commit_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t));

	/* If we are tracking wckey we need to cache
	   wckeys, if we aren't only cache the users, qos */
	assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS;
	if (slurmdbd_conf->track_wckey)
		assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY;

	db_conn = acct_storage_g_get_connection(NULL, 0, true, NULL);
	/* errno as left by the connection attempt is passed along */
	if (assoc_mgr_init(db_conn, &assoc_init_arg, errno) == SLURM_ERROR) {
		error("Problem getting cache of data");
		acct_storage_g_close_connection(&db_conn);
		/* NOTE(review): end_it commits on and closes db_conn again;
		 * presumably the close above resets it to NULL - confirm */
		goto end_it;
	}

	if (gethostname_short(node_name, sizeof(node_name)))
		fatal("getnodename: %m");

	while (1) {
		/* Pick the role by comparing this host with DbdBackup/DbdHost */
		if (slurmdbd_conf->dbd_backup &&
		    (!strcmp(node_name, slurmdbd_conf->dbd_backup) ||
		     !strcmp(slurmdbd_conf->dbd_backup, "localhost"))) {
			info("slurmdbd running in background mode");
			have_control = false;
			backup = true;
			/* make sure any locks are released */
			acct_storage_g_commit(db_conn, 1);
			run_dbd_backup();
			if (!shutdown_time)
				assoc_mgr_refresh_lists(db_conn);
		} else if (slurmdbd_conf->dbd_host &&
			   (!strcmp(slurmdbd_conf->dbd_host, node_name) ||
			    !strcmp(slurmdbd_conf->dbd_host, "localhost"))) {
			backup = false;
			have_control = true;
		} else {
			fatal("This host not configured to run SlurmDBD "
			      "(%s != %s | (backup) %s)",
			      node_name, slurmdbd_conf->dbd_host,
			      slurmdbd_conf->dbd_backup);
		}

		if (!shutdown_time) {
			/* Create attached thread to process incoming RPCs */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rpc_handler_thread, &thread_attr,
					   rpc_mgr, NULL))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		if (!shutdown_time) {
			/* Create attached thread to do usage rollup */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rollup_handler_thread,
					   &thread_attr,
					   _rollup_handler, db_conn))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		/* Daemon is fully operational here */
		if (!shutdown_time || primary_resumed) {
			shutdown_time = 0;
			info("slurmdbd version %s started",
			     SLURM_VERSION_STRING);
			/* a backup goes back to standby from here */
			if (backup)
				run_dbd_backup();
		}

		_request_registrations(db_conn);
		acct_storage_g_commit(db_conn, 1);

		/* this is only ran if not backup */
		if (rollup_handler_thread)
			pthread_join(rollup_handler_thread, NULL);
		if (rpc_handler_thread)
			pthread_join(rpc_handler_thread, NULL);

		/* Primary is back: clear the shutdown and loop again */
		if (backup && primary_resumed) {
			shutdown_time = 0;
			info("Backup has given up control");
		}

		if (shutdown_time)
			break;
	}
	/* Daemon termination handled here */

end_it:

	if (signal_handler_thread)
		pthread_join(signal_handler_thread, NULL);
	if (commit_handler_thread)
		pthread_join(commit_handler_thread, NULL);

	acct_storage_g_commit(db_conn, 1);
	acct_storage_g_close_connection(&db_conn);

	/* Failure to remove the pidfile is only reported, not fatal */
	if (slurmdbd_conf->pid_file &&
	    (unlink(slurmdbd_conf->pid_file) < 0)) {
		verbose("Unable to remove pidfile '%s': %m",
			slurmdbd_conf->pid_file);
	}

	FREE_NULL_LIST(registered_clusters);

	assoc_mgr_fini(NULL);
	slurm_acct_storage_fini();
	slurm_auth_fini();
	log_fini();
	free_slurmdbd_conf();
	exit(0);
}
resource_allocation_response_msg_t * allocate_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. */ if (opt.pn_min_memory != NO_VAL) opt.pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt.mem_per_cpu != NO_VAL) opt.mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); /* * FIXME: timelimit should probably also be updated * here since it could also change. 
*/ #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt.min_nodes = node_cnt; opt.max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt.min_nodes = resp->node_cnt; opt.max_nodes = resp->node_cnt; if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: if (resp) { if (!destroy_job) slurm_complete_job(resp->job_id, 1); slurm_free_resource_allocation_response_msg(resp); } exit(error_exit); return NULL; }
/*
 * main - slurmd entry point.
 *
 * Closes inherited file descriptors, drops supplementary groups, builds
 * the global "conf", verifies the running user, installs signal
 * handlers, optionally daemonizes, initializes the plugins, then runs
 * the message engine.  Once _msg_engine() returns, the pidfile is
 * removed and everything is torn down.
 *
 * RET 0 after an orderly shutdown; SLURM_FAILURE if the saved credential
 *	state cannot be restored.  Several other failures end the process
 *	through exit(1) or fatal().
 */
int main (int argc, char *argv[])
{
	int i, pidfd;
	/* zero-terminated signal list handed to xsignal_block() */
	int blocked_signals[] = {SIGPIPE, 0};
	int cc;
	char *oom_value;
	uint32_t slurmd_uid = 0;
	uint32_t curr_uid = 0;
	char time_stamp[256];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	/* NOTE: logfile is NULL at this point */
	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/*
	 * Make sure we have no extra open files which
	 * would be propagated to spawned tasks.
	 * (descriptors 0-2 are left open)
	 */
	cc = sysconf(_SC_OPEN_MAX);
	for (i = 3; i < cc; i++)
		close(i);

	/*
	 * Drop supplementary groups.
	 */
	if (geteuid() == 0) {
		if (setgroups(0, NULL) != 0) {
			fatal("Failed to drop supplementary groups, "
			      "setgroups: %m");
		}
	} else {
		debug("Not running as root. Can't drop supplementary groups");
	}

	/*
	 * Create and set default values for the slurmd global
	 * config variable "conf"
	 */
	conf = xmalloc(sizeof(slurmd_conf_t));
	_init_conf();
	conf->argv = &argv;
	conf->argc = &argc;

	if (_slurmd_init() < 0) {
		error( "slurmd initialization failed" );
		fflush( NULL );
		exit(1);
	}

	/* Refuse to run as anyone but the configured slurmd user */
	slurmd_uid = slurm_get_slurmd_user_id();
	curr_uid = getuid();
	if (curr_uid != slurmd_uid) {
		struct passwd *pw = NULL;
		char *slurmd_user = NULL;
		char *curr_user = NULL;

		/* since when you do a getpwuid you get a pointer to a
		 * structure you have to do a xstrdup on the first
		 * call or your information will just get over
		 * written.  This is a memory leak, but a fatal is
		 * called right after so it isn't that big of a deal.
		 */
		if ((pw=getpwuid(slurmd_uid)))
			slurmd_user = xstrdup(pw->pw_name);
		if ((pw=getpwuid(curr_uid)))
			curr_user = pw->pw_name;

		/* NOTE(review): either name stays NULL if its getpwuid()
		 * lookup fails, and slurmd_uid is unsigned but printed
		 * with %d - confirm fatal() copes with both */
		fatal("You are running slurmd as something "
		      "other than user %s(%d). If you want to "
		      "run as this user add SlurmdUser=%s "
		      "to the slurm.conf file.",
		      slurmd_user, slurmd_uid, curr_user);
	}
	init_setproctitle(argc, argv);

	xsignal(SIGTERM, &_term_handler);
	xsignal(SIGINT, &_term_handler);
	xsignal(SIGHUP, &_hup_handler );
	xsignal_block(blocked_signals);

	debug3("slurmd initialization successful");

	/*
	 * Become a daemon if desired.
	 * Do not chdir("/") or close all fd's
	 * (a failure to daemonize is logged and start-up continues)
	 */
	if (conf->daemonize) {
		if (daemon(1,1) == -1)
			error("Couldn't daemonize slurmd: %m");
	}
	test_core_limit();
	info("slurmd version %s started", SLURM_VERSION_STRING);
	debug3("finished daemonize");

	/* Optional OOM adjustment taken from the environment */
	if ((oom_value = getenv("SLURMD_OOM_ADJ"))) {
		i = atoi(oom_value);
		debug("Setting slurmd oom_adj to %d", i);
		set_oom_adj(i);
	}

	_kill_old_slurmd();

	if (conf->mlock_pages) {
		/*
		 * Call mlockall() if available to ensure slurmd
		 *  doesn't get swapped out
		 */
#ifdef _POSIX_MEMLOCK
		if (mlockall (MCL_FUTURE | MCL_CURRENT) < 0)
			error ("failed to mlock() slurmd pages: %m");
#else
		error ("mlockall() system call does not appear to be available");
#endif /* _POSIX_MEMLOCK */
	}

	/*
	 * Restore any saved revoked credential information
	 * (skipped on a clean start)
	 */
	if (!conf->cleanstart && (_restore_cred_state(conf->vctx) < 0))
		return SLURM_FAILURE;

	if (jobacct_gather_init() != SLURM_SUCCESS)
		fatal("Unable to initialize jobacct_gather");
	if (job_container_init() < 0)
		fatal("Unable to initialize job_container plugin.");
	/* a failed container restore is logged, not fatal */
	if (container_g_restore(conf->spooldir, !conf->cleanstart))
		error("Unable to restore job_container state.");
	if (switch_g_node_init() < 0)
		fatal("Unable to initialize interconnect.");
	if (conf->cleanstart && switch_g_clear_node_state())
		fatal("Unable to clear interconnect state.");
	switch_g_slurmd_init();

	_create_msg_socket();

	conf->pid = getpid();
	/* This has to happen after daemon(), which closes all fd's,
	   so we keep the write lock of the pidfile.
	   NOTE(review): daemon(1,1) above is documented as not closing
	   fd's - confirm which of the two comments is current.
	*/
	pidfd = create_pidfile(conf->pidfile, 0);

	rfc2822_timestamp(time_stamp, sizeof(time_stamp));
	info("%s started on %s", slurm_prog_name, time_stamp);

	_install_fork_handlers();
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/*
	 * Initialize any plugins
	 */
	if (slurmd_plugstack_init())
		fatal("failed to initialize slurmd_plugstack");

	_spawn_registration_engine();
	/* shutdown handling below starts once _msg_engine() returns */
	_msg_engine();

	/*
	 * Close fd here, otherwise we'll deadlock since create_pidfile()
	 * flocks the pidfile.
	 */
	if (pidfd >= 0)			/* valid pidfd, non-error */
		(void) close(pidfd);	/* Ignore errors */
	if (unlink(conf->pidfile) < 0)
		error("Unable to remove pidfile `%s': %m", conf->pidfile);

	_wait_for_all_threads(120);
	_slurmd_fini();
	_destroy_conf();
	slurm_crypto_fini();	/* must be after _destroy_conf() */

	info("Slurmd shutdown completing");
	log_fini();
	return 0;
}
int main (int argc, char *argv[]) { slurm_addr_t *cli; slurm_addr_t *self; slurm_msg_t *msg; slurmd_job_t *job; int ngids; gid_t *gids; int rc = 0; if (process_cmdline (argc, argv) < 0) fatal ("Error in slurmstepd command line"); xsignal_block(slurmstepd_blocked_signals); conf = xmalloc(sizeof(*conf)); conf->argv = &argv; conf->argc = &argc; init_setproctitle(argc, argv); if (slurm_select_init(1) != SLURM_SUCCESS ) fatal( "failed to initialize node selection plugin" ); /* Receive job parameters from the slurmd */ _init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg, &ngids, &gids); /* Fancy way of closing stdin that keeps STDIN_FILENO from being * allocated to any random file. The slurmd already opened /dev/null * on STDERR_FILENO for us. */ dup2(STDERR_FILENO, STDIN_FILENO); /* Create the slurmd_job_t, mostly from info in a launch_tasks_request_msg_t or a batch_job_launch_msg_t */ if(!(job = _step_setup(cli, self, msg))) { _send_fail_to_slurmd(STDOUT_FILENO); rc = SLURM_FAILURE; goto ending; } job->ngids = ngids; job->gids = gids; /* fork handlers cause mutexes on some global data structures to be re-initialized after the fork. */ list_install_fork_handlers(); slurm_conf_install_fork_handlers(); /* sets job->msg_handle and job->msgid */ if (msg_thr_create(job) == SLURM_ERROR) { _send_fail_to_slurmd(STDOUT_FILENO); rc = SLURM_FAILURE; goto ending; } _send_ok_to_slurmd(STDOUT_FILENO); /* Fancy way of closing stdout that keeps STDOUT_FILENO from being * allocated to any random file. The slurmd already opened /dev/null * on STDERR_FILENO for us. 
*/ dup2(STDERR_FILENO, STDOUT_FILENO); /* This does most of the stdio setup, then launches all the tasks, and blocks until the step is complete */ rc = job_manager(job); /* signal the message thread to shutdown, and wait for it */ eio_signal_shutdown(job->msg_handle); pthread_join(job->msgid, NULL); if (job->batch) batch_finish(job, rc); /* sends batch complete message */ ending: #ifdef MEMORY_LEAK_DEBUG _step_cleanup(job, msg, rc); xfree(cli); xfree(self); xfree(conf->hostname); xfree(conf->block_map); xfree(conf->block_map_inv); xfree(conf->spooldir); xfree(conf->node_name); xfree(conf->node_topo_addr); xfree(conf->node_topo_pattern); xfree(conf->logfile); xfree(conf); #endif info("done with job"); return rc; }
/*
 * Fill in job->ctx_params from the supplied option set and create the job
 * step, retrying while the failure is reported as retryable.
 *
 * IN job             - srun job record; its ctx_params and step_ctx are
 *                      (re)written here
 * IN use_all_cpus    - true if the job allocation was created by srun, in
 *                      which case the job's cpu_count may be used for the step
 * IN signal_function - handler installed for every signal in sig_array while
 *                      waiting to retry step creation
 * IN destroy_job     - flag polled to detect that the request was cancelled
 * IN opt_local       - options to build the step request from; some fields
 *                      (ntasks, distribution, tres_bind, tres_freq) are
 *                      updated in place
 *
 * RET SLURM_SUCCESS or SLURM_ERROR. SLURM_ERROR is returned when job is
 * NULL, when the node count limits are inconsistent, when step creation
 * fails with a non-retryable error or the opt_local->immediate limit has
 * been reached, and when a pending retry was cancelled via *destroy_job.
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job,
					 slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	int i, j, rc;
	unsigned long step_wait = 0;
	uint16_t base_dist, slurmctld_timeout;
	char *add_tres;
	/* Set once sig_array was unblocked and signal_function installed */
	bool handlers_installed = false;

	xassert(srun_opt);

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.step_id = job->stepid;
	job->ctx_params.uid = opt_local->uid;

	/* Validate minimum and maximum node counts */
	if (opt_local->min_nodes && opt_local->max_nodes &&
	    (opt_local->min_nodes > opt_local->max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt_local->min_nodes, opt_local->max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END
	if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt_local->min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif

	/* Node limits default to the allocation size, lowered on request */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt_local->min_nodes &&
	    (opt_local->min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt_local->min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt_local->max_nodes &&
	    (opt_local->max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt_local->max_nodes;

	if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL))
		job->ntasks = opt_local->ntasks = job->nhosts *
						  opt_local->ntasks_per_node;
	job->ctx_params.task_count = opt_local->ntasks;

	if (opt_local->mem_per_cpu != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->mem_per_cpu |
						MEM_PER_CPU;
	else if (opt_local->pn_min_memory != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->pn_min_memory;

	if (opt_local->overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt_local->cpus_set) {
		job->ctx_params.cpu_count = opt_local->ntasks *
					    opt_local->cpus_per_task;
	} else if (opt_local->ntasks_set) {
		job->ctx_params.cpu_count = opt_local->ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt_local->ntasks;
	}

	job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)srun_opt->relative;
	job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval;
	job->ctx_params.ckpt_dir = srun_opt->ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive;
	if (opt_local->immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt_local->immediate;
	if (opt_local->time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt_local->time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (srun_opt->resv_port_cnt != NO_VAL) {
		job->ctx_params.resv_port_cnt =
			(uint16_t)srun_opt->resv_port_cnt;
	} else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (srun_opt->multi_prog ?
					srun_opt->multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt_local->plane_size;
		break;
	default:
		/* Leave distribution set to unknown if taskcount <= nodes and
		 * memory is set to 0. step_mgr will handle the 0mem case.
		 * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */
		if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory)
			base_dist = SLURM_DIST_UNKNOWN;
		else
			base_dist = (job->ctx_params.task_count <=
				     job->ctx_params.min_nodes) ?
				    SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt_local->distribution &= SLURM_DIST_STATE_FLAGS;
		opt_local->distribution |= base_dist;
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0;
	job->ctx_params.node_list = opt_local->nodelist;
	job->ctx_params.network = opt_local->network;
	job->ctx_params.no_kill = opt_local->no_kill;
	if (srun_opt->job_name_set_cmd && opt_local->job_name)
		job->ctx_params.name = opt_local->job_name;
	else
		job->ctx_params.name = srun_opt->cmd_name;
	job->ctx_params.features = opt_local->constraints;

	if (opt_local->cpus_per_gpu) {
		xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d",
			   opt_local->cpus_per_gpu);
	}

	xfree(opt_local->tres_bind);	/* Vestigial value from job allocate */
	if (opt_local->gpu_bind)
		xstrfmtcat(opt_local->tres_bind, "gpu:%s",
			   opt_local->gpu_bind);
	if (tres_bind_verify_cmdline(opt_local->tres_bind)) {
		if (tres_bind_err_log) {	/* Log once */
			error("Invalid --tres-bind argument: %s. Ignored",
			      opt_local->tres_bind);
			tres_bind_err_log = false;
		}
		xfree(opt_local->tres_bind);
	}
	job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind);

	xfree(opt_local->tres_freq);	/* Vestigial value from job allocate */
	xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq);
	if (tres_freq_verify_cmdline(opt_local->tres_freq)) {
		if (tres_freq_err_log) {	/* Log once */
			error("Invalid --tres-freq argument: %s. Ignored",
			      opt_local->tres_freq);
			tres_freq_err_log = false;
		}
		xfree(opt_local->tres_freq);
	}
	job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq);

	job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job);
	xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus);
	xfmt_tres(&job->ctx_params.tres_per_node, "gpu",
		  opt_local->gpus_per_node);

	/* Explicit gres wins over the SLURM_STEP_GRES environment variable */
	if (opt_local->gres)
		add_tres = opt_local->gres;
	else
		add_tres = getenv("SLURM_STEP_GRES");
	if (add_tres) {
		if (job->ctx_params.tres_per_node) {
			xstrfmtcat(job->ctx_params.tres_per_node, ",%s",
				   add_tres);
		} else
			job->ctx_params.tres_per_node = xstrdup(add_tres);
	}
	xfmt_tres(&job->ctx_params.tres_per_socket, "gpu",
		  opt_local->gpus_per_socket);
	xfmt_tres(&job->ctx_params.tres_per_task, "gpu",
		  opt_local->gpus_per_task);
	if (opt_local->mem_per_gpu) {
		/*
		 * Use the value from opt_local (the option set this step is
		 * being built from), not from the global opt.
		 */
		xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64,
			   opt_local->mem_per_gpu);
	}

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	for (i = 0; (!(*destroy_job)); i++) {
		if (srun_opt->no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			if (opt_local->immediate) {
				/* Time left of the immediate limit, >= 1 */
				step_wait = MAX(1, opt_local->immediate -
						   difftime(time(NULL),
							    srun_begin_time)) *
					    1000;
			} else {
				slurmctld_timeout = MIN(300, MAX(60,
					slurm_get_slurmctld_timeout()));
				step_wait = ((getpid() % 10) +
					     slurmctld_timeout) * 1000;
			}
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0) {
				info("Step created for job %u",
				     job->ctx_params.job_id);
			}
			break;
		}
		rc = slurm_get_errno();

		/*
		 * Give up if the immediate limit has been reached or the
		 * error is not one that is worth retrying.
		 */
		if (((opt_local->immediate != 0) &&
		     ((opt_local->immediate == 1) ||
		      (difftime(time(NULL), srun_begin_time) >=
		       opt_local->immediate))) ||
		    ((rc != ESLURM_PROLOG_RUNNING) &&
		     !slurm_step_retry_errno(rc))) {
			error("Unable to create step for job %u: %m",
			      job->ctx_params.job_id);
			return SLURM_ERROR;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job %u step creation temporarily disabled, retrying",
				     job->ctx_params.job_id);
			}
			/* Allow the user to cancel while we keep retrying */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
			handlers_installed = true;
		} else {
			verbose("Job %u step creation still disabled, retrying",
				job->ctx_params.job_id);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}

	/*
	 * Key the post-loop handling on whether the signals were unblocked
	 * rather than on the loop counter: a cancel during the very first
	 * retry breaks out of the loop with i still 0, and must still
	 * re-block the signals and be reported as a cancelled step.
	 */
	if (handlers_installed) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending step for job %u",
			     job->ctx_params.job_id);
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*
	 * Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job, opt_local);

	return SLURM_SUCCESS;
}
/* * Allocate nodes for heterogeneous/pack job from the slurm controller -- * retrying the attempt if the controller appears to be down, and optionally * waiting for resources if none are currently available (see opt.immediate) * * Returns a pointer to a resource_allocation_response_msg which must * be freed with slurm_free_resource_allocation_response_msg() */ List allocate_pack_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; bool jobid_log = true; job_desc_msg_t *j, *first_job = NULL; slurm_allocation_callbacks_t callbacks; ListIterator opt_iter, resp_iter; slurm_opt_t *opt_local, *first_opt = NULL; List job_req_list = NULL, job_resp_list = NULL; uint32_t my_job_id = 0; int i, k; job_req_list = list_create(NULL); opt_iter = list_iterator_create(opt_list); while ((opt_local = list_next(opt_iter))) { srun_opt_t *srun_opt = opt_local->srun_opt; xassert(srun_opt); if (!first_opt) first_opt = opt_local; if (srun_opt->relative_set && srun_opt->relative) fatal("--relative option invalid for job allocation request"); if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) return NULL; if (!first_job) first_job = j; j->origin_cluster = xstrdup(slurmctld_conf.cluster_name); /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt_local->jobid_set) { if (jobid_log) { jobid_log = false; /* log once */ info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); } if (!opt_local->jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } list_append(job_req_list, j); } list_iterator_destroy(opt_iter); if (!first_job) { error("%s: No job requests found", __func__); return NULL; } if (first_opt && first_opt->clusters && (slurmdb_get_first_pack_cluster(job_req_list, first_opt->clusters, &working_cluster_rec) != SLURM_SUCCESS)) { print_db_notok(first_opt->clusters, 0); return NULL; } 
callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (first_opt && !job_resp_list) { job_resp_list = slurm_allocate_pack_job_blocking(job_req_list, first_opt->immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!job_resp_list && !_retry()) { break; } } if (job_resp_list && !destroy_job) { /* * Allocation granted! */ opt_iter = list_iterator_create(opt_list); resp_iter = list_iterator_create(job_resp_list); while ((opt_local = list_next(opt_iter))) { resp = (resource_allocation_response_msg_t *) list_next(resp_iter); if (!resp) break; if (pending_job_id == 0) pending_job_id = resp->job_id; if (my_job_id == 0) { my_job_id = resp->job_id; i = list_count(opt_list); k = list_count(job_resp_list); if (i != k) { error("%s: request count != response count (%d != %d)", __func__, i, k); goto relinquish; } } /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. * * NOTE: pn_min_memory here is an int64, not uint64. 
* These operations may have some bizarre side effects */ if (opt_local->pn_min_memory != NO_VAL64) opt_local->pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt_local->mem_per_cpu != NO_VAL64) opt_local->mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt_local->min_nodes = node_cnt; opt_local->max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt_local->min_nodes = resp->node_cnt; opt_local->max_nodes = resp->node_cnt; if (resp->working_cluster_rec) slurm_setup_remote_working_cluster(resp); if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } list_iterator_destroy(resp_iter); list_iterator_destroy(opt_iter); } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); return job_resp_list; relinquish: if (job_resp_list) { if (!destroy_job && my_job_id) slurm_complete_job(my_job_id, 1); list_destroy(job_resp_list); } exit(error_exit); return NULL; }
/*
 * Allocate nodes from the slurm controller -- retrying the attempt
 * if the controller appears to be down, and optionally waiting for
 * resources if none are currently available (see opt_local->immediate)
 *
 * IN handle_signals - if true, _signal_while_allocating is installed for the
 *                     signals in sig_array while waiting and those signals
 *                     are blocked again before returning
 * IN/OUT opt_local  - options the request is built from; on success its
 *                     memory and node count fields are overwritten with what
 *                     was actually allocated
 *
 * Returns a pointer to a resource_allocation_response_msg which must
 * be freed with slurm_free_resource_allocation_response_msg(), or NULL if
 * the request could not be built, cluster selection failed or no allocation
 * was obtained. If the request was cancelled (destroy_job set), or the
 * allocated resources do not become ready, the process exits with
 * error_exit instead of returning.
 */
extern resource_allocation_response_msg_t *
allocate_nodes(bool handle_signals, slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j;
	slurm_allocation_callbacks_t callbacks;
	int i;

	xassert(srun_opt);

	if (srun_opt->relative_set && srun_opt->relative)
		fatal("--relative option invalid for job allocation request");

	/*
	 * Build the request from the option set we were handed, not from the
	 * global opt: everything below reads and updates opt_local.
	 */
	if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
		return NULL;

	if (opt_local->clusters &&
	    (slurmdb_get_first_avail_cluster(j, opt_local->clusters,
					     &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(opt_local->clusters, 0);
		job_desc_msg_destroy(j);	/* don't leak the request */
		return NULL;
	}

	j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		/* Let slurmctld set jobid */
		j->job_id = NO_VAL;
	}

	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j,
							 opt_local->immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp)
		print_multi_line_string(resp->job_submit_user_msg, -1);

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		opt_local->pn_min_memory = NO_VAL64;
		opt_local->mem_per_cpu = NO_VAL64;
		if (resp->pn_min_memory != NO_VAL64) {
			/* MEM_PER_CPU flag selects which field is meant */
			if (resp->pn_min_memory & MEM_PER_CPU) {
				opt_local->mem_per_cpu =
					(resp->pn_min_memory & (~MEM_PER_CPU));
			} else {
				opt_local->pn_min_memory = resp->pn_min_memory;
			}
		}

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt_local->min_nodes = node_cnt;
			opt_local->max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt_local->min_nodes = resp->node_cnt;
		opt_local->max_nodes = resp->node_cnt;

		if (resp->working_cluster_rec)
			slurm_setup_remote_working_cluster(resp);

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	/* Give back whatever was allocated, then terminate the process */
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
int main (int argc, char **argv) { slurm_addr_t *cli; slurm_addr_t *self; slurm_msg_t *msg; stepd_step_rec_t *job; int ngids; gid_t *gids; int rc = 0; char *launch_params; if (_process_cmdline (argc, argv) < 0) fatal ("Error in slurmstepd command line"); xsignal_block(slurmstepd_blocked_signals); conf = xmalloc(sizeof(*conf)); conf->argv = &argv; conf->argc = &argc; init_setproctitle(argc, argv); if (slurm_select_init(1) != SLURM_SUCCESS ) fatal( "failed to initialize node selection plugin" ); if (slurm_auth_init(NULL) != SLURM_SUCCESS) fatal( "failed to initialize authentication plugin" ); /* Receive job parameters from the slurmd */ _init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg, &ngids, &gids); /* Create the stepd_step_rec_t, mostly from info in a * launch_tasks_request_msg_t or a batch_job_launch_msg_t */ if (!(job = _step_setup(cli, self, msg))) { _send_fail_to_slurmd(STDOUT_FILENO); rc = SLURM_FAILURE; goto ending; } job->ngids = ngids; job->gids = gids; /* fork handlers cause mutexes on some global data structures * to be re-initialized after the fork. */ list_install_fork_handlers(); slurm_conf_install_fork_handlers(); /* sets job->msg_handle and job->msgid */ if (msg_thr_create(job) == SLURM_ERROR) { _send_fail_to_slurmd(STDOUT_FILENO); rc = SLURM_FAILURE; goto ending; } _send_ok_to_slurmd(STDOUT_FILENO); _got_ack_from_slurmd(STDIN_FILENO); /* Fancy way of closing stdin that keeps STDIN_FILENO from being * allocated to any random file. The slurmd already opened /dev/null * on STDERR_FILENO for us. */ dup2(STDERR_FILENO, STDIN_FILENO); /* Fancy way of closing stdout that keeps STDOUT_FILENO from being * allocated to any random file. The slurmd already opened /dev/null * on STDERR_FILENO for us. */ dup2(STDERR_FILENO, STDOUT_FILENO); /* slurmstepd is the only daemon that should survive upgrade. If it * had been swapped out before upgrade happened it could easily lead * to SIGBUS at any time after upgrade. Avoid that by locking it * in-memory. 
*/ launch_params = slurm_get_launch_params(); if (launch_params && strstr(launch_params, "slurmstepd_memlock")) { #ifdef _POSIX_MEMLOCK int flags = MCL_CURRENT; if (strstr(launch_params, "slurmstepd_memlock_all")) flags |= MCL_FUTURE; if (mlockall(flags) < 0) info("failed to mlock() slurmstepd pages: %m"); else debug("slurmstepd locked in memory"); #else info("mlockall() system call does not appear to be available"); #endif } xfree(launch_params); /* This does most of the stdio setup, then launches all the tasks, * and blocks until the step is complete */ rc = job_manager(job); return stepd_cleanup(msg, job, cli, self, rc, 0); ending: return stepd_cleanup(msg, job, cli, self, rc, 1); }