static void *_pty_thread(void *arg) { int fd = -1; srun_job_t *job = (srun_job_t *) arg; slurm_addr_t client_addr; xsignal_unblock(pty_sigarray); xsignal(SIGWINCH, _handle_sigwinch); if ((fd = slurm_accept_msg_conn(job->pty_fd, &client_addr)) < 0) { error("pty: accept failure: %m"); return NULL; } while (job->state <= SRUN_JOB_RUNNING) { debug2("waiting for SIGWINCH"); poll(NULL, 0, -1); if (winch) { set_winsize(job); _notify_winsize_change(fd, job); } winch = 0; } return NULL; }
/*
 * slurm_persist_conn_recv_server_init - per-thread setup done before
 *	receiving persistent connections
 *
 * Resets shutdown_time, makes the calling thread asynchronously
 * cancellable, and installs and unblocks a SIGUSR1 handler.
 */
extern void slurm_persist_conn_recv_server_init(void)
{
	/* zero-terminated list of the signals to unblock */
	int usr1_only[] = { SIGUSR1, 0 };

	shutdown_time = 0;

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/*
	 * SIGUSR1 is caught so that a blocking accept() can be
	 * interrupted. Per the original author's note, the slurmdbd
	 * signal handler thread raises it after receiving SIGABRT,
	 * SIGINT or SIGTERM; that thread does all other signal
	 * processing.
	 */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(usr1_only);
}
/*
 * launch_common_create_job_step - build the step request from the srun
 *	options and create the job step, retrying while the controller
 *	reports a temporary condition
 * IN/OUT job - job to create the step for; ctx_params, step_ctx, stepid,
 *	nhosts (and possibly ntasks) are updated
 * IN use_all_cpus - true if the step should use job->cpu_count CPUs
 *	(noted in the code as "job allocation created by srun")
 * IN signal_function - handler installed for every signal in sig_array
 *	once the first creation attempt has failed
 * IN destroy_job - flag polled to detect cancellation by signal
 * RET SLURM_SUCCESS, or SLURM_ERROR on bad arguments, invalid node counts,
 *	a non-retryable creation failure, or cancellation
 *
 * NOTE: may modify the global "opt" (ntasks, distribution).
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int j, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;
	bool retrying = false;		/* at least one attempt has failed
					 * and handlers are installed */
	bool step_created = false;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	/* Node range defaults to the allocation size, narrowed by any
	 * smaller user-supplied limits */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;

	/* Fall back to the environment when no gres option was given */
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/* No recognised base distribution: pick cyclic when there
		 * are no more tasks than nodes, block otherwise, keeping
		 * any flag bits already present */
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes)
			    ? SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;
	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	begin_time = time(NULL);

	while (!(*destroy_job)) {
		bool blocking_step_create = true;

		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			step_created = true;
			if (retrying)
				info("Job step created");
			break;
		}

		/* Give up when the --immediate budget is exhausted or the
		 * error is not one of the known temporary conditions */
		rc = slurm_get_errno();
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) &&
		     (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (!retrying) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/*
			 * Install the caller's handler so a signal can
			 * cancel the wait. A dedicated index is used: the
			 * retry state must not depend on how many signals
			 * sig_array happens to contain.
			 */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
			retrying = true;
		} else {
			verbose("Job step creation still disabled, retrying");
		}

		if (!blocking_step_create) {
			/*
			 * sleep 0.1 to 29 secs with exponential back-off.
			 * The delay is seeded on the first non-blocking
			 * retry, whichever attempt that is; seeding it only
			 * on the very first failure left it at zero (a busy
			 * loop) when ESLURM_DISABLED first showed up later.
			 */
			if (my_sleep == 0)
				my_sleep = (getpid() % 1000) * 100 + 100000;
			else
				my_sleep *= 2;
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}

	if (retrying)
		xsignal_block(sig_array);

	/*
	 * Cancelled while waiting, or cancelled before any step could be
	 * created: there is no usable step context to report on.
	 */
	if (!step_created || (retrying && *destroy_job)) {
		info("Cancelled pending job step");
		return SLURM_ERROR;
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case. */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
/* _background_rpc_mgr - Read and process incoming RPCs to the background * controller (that's us) */ static void *_background_rpc_mgr(void *no_data) { slurm_fd_t newsockfd; slurm_fd_t sockfd; slurm_addr_t cli_addr; slurm_msg_t *msg = NULL; int error_code; char* node_addr = NULL; /* Read configuration only */ slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; int sigarray[] = {SIGUSR1, 0}; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid()); /* initialize port for RPCs */ lock_slurmctld(config_read_lock); /* set node_addr to bind to (NULL means any) */ if ((strcmp(slurmctld_conf.backup_controller, slurmctld_conf.backup_addr) != 0)) { node_addr = slurmctld_conf.backup_addr ; } if ((sockfd = slurm_init_msg_engine_addrname_port(node_addr, slurmctld_conf. slurmctld_port)) == SLURM_SOCKET_ERROR) fatal("slurm_init_msg_engine_addrname_port error %m"); unlock_slurmctld(config_read_lock); /* Prepare to catch SIGUSR1 to interrupt accept(). * This signal is generated by the slurmctld signal * handler thread upon receipt of SIGABRT, SIGINT, * or SIGTERM. That thread does all processing of * all signals. 
*/ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); /* * Process incoming RPCs indefinitely */ while (slurmctld_config.shutdown_time == 0) { /* accept needed for stream implementation * is a no-op in message implementation that just passes * sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { if (errno != EINTR) error("slurm_accept_msg_conn: %m"); continue; } msg = xmalloc(sizeof(slurm_msg_t)); slurm_msg_t_init(msg); if (slurm_receive_msg(newsockfd, msg, 0) != 0) error("slurm_receive_msg: %m"); error_code = _background_process_msg(msg); if ((error_code == SLURM_SUCCESS) && (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) && (slurmctld_config.shutdown_time == 0)) slurmctld_config.shutdown_time = time(NULL); slurm_free_msg_data(msg->msg_type, msg->data); slurm_free_msg(msg); slurm_close(newsockfd); /* close new socket */ } debug3("_background_rpc_mgr shutting down"); slurm_close(sockfd); /* close the main socket */ pthread_exit((void *) 0); return NULL; }
/*
 * create_job_step - build the step request from the srun options and
 *	create the job step, retrying while the controller reports a
 *	temporary condition
 * IN/OUT job - job to create the step for; ctx_params, step_ctx, stepid,
 *	nhosts (and possibly ntasks) are updated
 * IN use_all_cpus - true if the step should use job->cpu_count CPUs
 * RET 0 on success, -1 on invalid node counts, a non-retryable creation
 *	failure, or cancellation (global destroy_job set)
 *
 * NOTE: may modify the globals "opt" (ntasks, distribution) and
 * totalview_jobid. totalview_jobid is reset to NULL before being rebuilt;
 * any previous value is not freed here.
 */
extern int create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int j, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;
	bool retrying = false;		/* at least one attempt has failed
					 * and handlers are installed */
	bool step_created = false;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt.min_nodes, job->nhosts);
		return -1;
	}
#endif
	/* Node range defaults to the allocation size, narrowed by any
	 * smaller user-supplied limits */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks*opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/* No recognised distribution: cyclic when there are no
		 * more tasks than nodes, block otherwise */
		job->ctx_params.task_dist = (job->ctx_params.task_count <=
					     job->ctx_params.min_nodes)
					    ? SLURM_DIST_CYCLIC
					    : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;
	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	begin_time = time(NULL);

	while (!destroy_job) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		}
		if (job->step_ctx != NULL) {
			step_created = true;
			if (retrying)
				info("Job step created");
			break;
		}

		/* Give up when the --immediate budget is exhausted or the
		 * error is not one of the known temporary conditions */
		rc = slurm_get_errno();
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) &&
		     (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error ("Unable to create job step: %m");
			return -1;
		}

		if (!retrying) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/*
			 * Install the cancellation handler. A dedicated
			 * index is used: the retry state must not depend on
			 * how many signals sig_array happens to contain.
			 */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], _signal_while_allocating);
			retrying = true;
			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);

		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}

	if (retrying)
		xsignal_block(sig_array);

	/*
	 * Cancelled while waiting, or cancelled before any step could be
	 * created: there is no usable step context to report on.
	 */
	if (!step_created || (retrying && destroy_job)) {
		info("Cancelled pending job step");
		return -1;
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case. */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
resource_allocation_response_msg_t * allocate_nodes(void) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; #ifdef HAVE_BG if (!_wait_bluegene_block_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else if (!_wait_nodes_ready(resp)) { if(!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: slurm_free_resource_allocation_response_msg(resp); if (!destroy_job) slurm_complete_job(resp->job_id, 1); exit(error_exit); return NULL; }
resource_allocation_response_msg_t * allocate_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; job_desc_msg_t *j = job_desc_msg_create_from_opts(); slurm_allocation_callbacks_t callbacks; int i; if (!j) return NULL; /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt.jobid_set) { info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); if (!opt.jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (!resp) { resp = slurm_allocate_resources_blocking(j, opt.immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!resp && !_retry()) { break; } } if (resp && !destroy_job) { /* * Allocation granted! */ pending_job_id = resp->job_id; /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. */ if (opt.pn_min_memory != NO_VAL) opt.pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt.mem_per_cpu != NO_VAL) opt.mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); /* * FIXME: timelimit should probably also be updated * here since it could also change. 
*/ #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt.min_nodes = node_cnt; opt.max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt.min_nodes = resp->node_cnt; opt.max_nodes = resp->node_cnt; if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); job_desc_msg_destroy(j); return resp; relinquish: if (resp) { if (!destroy_job) slurm_complete_job(resp->job_id, 1); slurm_free_resource_allocation_response_msg(resp); } exit(error_exit); return NULL; }
/* Process incoming RPCs. Meant to execute as a pthread */ extern void *rpc_mgr(void *no_data) { pthread_attr_t thread_attr_rpc_req; slurm_fd_t sockfd, newsockfd; int i, retry_cnt, sigarray[] = {SIGUSR1, 0}; slurm_addr_t cli_addr; slurmdbd_conn_t *conn_arg = NULL; slurm_mutex_lock(&thread_count_lock); master_thread_id = pthread_self(); slurm_mutex_unlock(&thread_count_lock); (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); /* threads to process individual RPC's are detached */ slurm_attr_init(&thread_attr_rpc_req); if (pthread_attr_setdetachstate (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED)) fatal("pthread_attr_setdetachstate %m"); /* initialize port for RPCs */ if ((sockfd = slurm_init_msg_engine_port(get_dbd_port())) == SLURM_SOCKET_ERROR) fatal("slurm_init_msg_engine_port error %m"); /* Prepare to catch SIGUSR1 to interrupt accept(). * This signal is generated by the slurmdbd signal * handler thread upon receipt of SIGABRT, SIGINT, * or SIGTERM. That thread does all processing of * all signals. 
*/ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); /* * Process incoming RPCs until told to shutdown */ while ((i = _wait_for_server_thread()) >= 0) { /* * accept needed for stream implementation is a no-op in * message implementation that just passes sockfd to newsockfd */ if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr)) == SLURM_SOCKET_ERROR) { _free_server_thread((pthread_t) 0); if (errno != EINTR) error("slurm_accept_msg_conn: %m"); continue; } fd_set_nonblocking(newsockfd); conn_arg = xmalloc(sizeof(slurmdbd_conn_t)); conn_arg->newsockfd = newsockfd; slurm_get_ip_str(&cli_addr, &conn_arg->orig_port, conn_arg->ip, sizeof(conn_arg->ip)); retry_cnt = 0; while (pthread_create(&slave_thread_id[i], &thread_attr_rpc_req, _service_connection, (void *) conn_arg)) { if (retry_cnt > 0) { error("pthread_create failure, " "aborting RPC: %m"); close(newsockfd); break; } error("pthread_create failure: %m"); retry_cnt++; usleep(1000); /* retry in 1 msec */ } } debug3("rpc_mgr shutting down"); slurm_attr_destroy(&thread_attr_rpc_req); (void) slurm_shutdown_msg_engine(sockfd); _wait_for_thread_fini(); pthread_exit((void *) 0); return NULL; }
/* * _thread_per_group_rpc - thread to issue an RPC for a group of nodes * sending message out to one and forwarding it to * others if necessary. * IN/OUT args - pointer to task_info_t, xfree'd on completion */ static void *_thread_per_group_rpc(void *args) { int rc = SLURM_SUCCESS; slurm_msg_t msg; task_info_t *task_ptr = (task_info_t *) args; /* we cache some pointers from task_info_t because we need * to xfree args before being finished with their use. xfree * is required for timely termination of this pthread because * xfree could lock it at the end, preventing a timely * thread_exit */ pthread_mutex_t *thread_mutex_ptr = task_ptr->thread_mutex_ptr; pthread_cond_t *thread_cond_ptr = task_ptr->thread_cond_ptr; uint32_t *threads_active_ptr = task_ptr->threads_active_ptr; thd_t *thread_ptr = task_ptr->thread_struct_ptr; state_t thread_state = DSH_NO_RESP; slurm_msg_type_t msg_type = task_ptr->msg_type; bool is_kill_msg, srun_agent; List ret_list = NULL; ListIterator itr; ret_data_info_t *ret_data_info = NULL; int found = 0; int sig_array[2] = {SIGUSR1, 0}; /* Locks: Write job, write node */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; xassert(args != NULL); xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sig_array); is_kill_msg = ( (msg_type == REQUEST_KILL_TIMELIMIT) || (msg_type == REQUEST_TERMINATE_JOB) ); srun_agent = ( (msg_type == SRUN_PING) || (msg_type == SRUN_EXEC) || (msg_type == SRUN_JOB_COMPLETE) || (msg_type == SRUN_STEP_MISSING) || (msg_type == SRUN_TIMEOUT) || (msg_type == SRUN_USER_MSG) || (msg_type == RESPONSE_RESOURCE_ALLOCATION) || (msg_type == SRUN_NODE_FAIL) ); thread_ptr->start_time = time(NULL); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->state = DSH_ACTIVE; thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT; slurm_mutex_unlock(thread_mutex_ptr); /* send request message */ slurm_msg_t_init(&msg); msg.msg_type = msg_type; msg.data = task_ptr->msg_args_ptr; #if 0 info("sending message type %u 
to %s", msg_type, thread_ptr->nodelist); #endif if (task_ptr->get_reply) { if(thread_ptr->addr) { msg.address = *thread_ptr->addr; if(!(ret_list = slurm_send_addr_recv_msgs( &msg, thread_ptr->nodelist, 0))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } else { if(!(ret_list = slurm_send_recv_msgs( thread_ptr->nodelist, &msg, 0, true))) { error("_thread_per_group_rpc: " "no ret_list given"); goto cleanup; } } } else { if(thread_ptr->addr) { //info("got the address"); msg.address = *thread_ptr->addr; } else { //info("no address given"); if(slurm_conf_get_addr(thread_ptr->nodelist, &msg.address) == SLURM_ERROR) { error("_thread_per_group_rpc: " "can't find address for host %s, " "check slurm.conf", thread_ptr->nodelist); goto cleanup; } } //info("sending %u to %s", msg_type, thread_ptr->nodelist); if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) { thread_state = DSH_DONE; } else { if (!srun_agent) _comm_err(thread_ptr->nodelist, msg_type); } goto cleanup; } //info("got %d messages back", list_count(ret_list)); found = 0; itr = list_iterator_create(ret_list); while ((ret_data_info = list_next(itr)) != NULL) { rc = slurm_get_return_code(ret_data_info->type, ret_data_info->data); /* SPECIAL CASE: Mark node as IDLE if job already complete */ if (is_kill_msg && (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) { kill_job_msg_t *kill_job; kill_job = (kill_job_msg_t *) task_ptr->msg_args_ptr; rc = SLURM_SUCCESS; lock_slurmctld(job_write_lock); if (job_epilog_complete(kill_job->job_id, ret_data_info-> node_name, rc)) run_scheduler = true; unlock_slurmctld(job_write_lock); } /* SPECIAL CASE: Kill non-startable batch job, * Requeue the job on ESLURMD_PROLOG_FAILED */ if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) && (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) && (ret_data_info->type != RESPONSE_FORWARD_FAILED)) { batch_job_launch_msg_t *launch_msg_ptr = task_ptr->msg_args_ptr; uint32_t job_id = launch_msg_ptr->job_id; info("Killing non-startable 
batch job %u: %s", job_id, slurm_strerror(rc)); thread_state = DSH_DONE; ret_data_info->err = thread_state; lock_slurmctld(job_write_lock); job_complete(job_id, 0, false, false, _wif_status()); unlock_slurmctld(job_write_lock); continue; } if (((msg_type == REQUEST_SIGNAL_TASKS) || (msg_type == REQUEST_TERMINATE_TASKS)) && (rc == ESRCH)) { /* process is already dead, not a real error */ rc = SLURM_SUCCESS; } switch (rc) { case SLURM_SUCCESS: /* debug("agent processed RPC to node %s", */ /* ret_data_info->node_name); */ thread_state = DSH_DONE; break; case SLURM_UNKNOWN_FORWARD_ADDR: error("We were unable to forward message to '%s'. " "Make sure the slurm.conf for each slurmd " "contain all other nodes in your system.", ret_data_info->node_name); thread_state = DSH_NO_RESP; break; case ESLURMD_EPILOG_FAILED: error("Epilog failure on host %s, " "setting DOWN", ret_data_info->node_name); thread_state = DSH_FAILED; break; case ESLURMD_PROLOG_FAILED: thread_state = DSH_FAILED; break; case ESLURM_INVALID_JOB_ID: /* Not indicative of a real error */ case ESLURMD_JOB_NOTRUNNING: /* Not indicative of a real error */ debug2("agent processed RPC to node %s: %s", ret_data_info->node_name, slurm_strerror(rc)); thread_state = DSH_DONE; break; default: if (!srun_agent) { if (ret_data_info->err) errno = ret_data_info->err; else errno = rc; rc = _comm_err(ret_data_info->node_name, msg_type); } if (srun_agent) thread_state = DSH_FAILED; else if(ret_data_info->type == RESPONSE_FORWARD_FAILED) /* check if a forward failed */ thread_state = DSH_NO_RESP; else { /* some will fail that don't mean anything went * bad like a job term request on a job that is * already finished, we will just exit on those * cases */ thread_state = DSH_DONE; } } ret_data_info->err = thread_state; } list_iterator_destroy(itr); cleanup: xfree(args); /* handled at end of thread just in case resend is needed */ destroy_forward(&msg.forward); slurm_mutex_lock(thread_mutex_ptr); thread_ptr->ret_list = ret_list; 
thread_ptr->state = thread_state; thread_ptr->end_time = (time_t) difftime(time(NULL), thread_ptr->start_time); /* Signal completion so another thread can replace us */ (*threads_active_ptr)--; pthread_cond_signal(thread_cond_ptr); slurm_mutex_unlock(thread_mutex_ptr); return (void *) NULL; }
static void *_agent(void *x) { int cnt, rc; Buf buffer; struct timespec abs_time; static time_t fail_time = 0; int sigarray[] = {SIGUSR1, 0}; slurmdbd_msg_t list_req; dbd_list_msg_t list_msg; list_req.msg_type = DBD_SEND_MULT_MSG; list_req.data = &list_msg; memset(&list_msg, 0, sizeof(dbd_list_msg_t)); /* DEF_TIMERS; */ /* Prepare to catch SIGUSR1 to interrupt pending * I/O and terminate in a timely fashion. */ xsignal(SIGUSR1, _sig_handler); xsignal_unblock(sigarray); while (*slurmdbd_conn->shutdown == 0) { /* START_TIMER; */ slurm_mutex_lock(&slurmdbd_lock); if (halt_agent) slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock); if ((slurmdbd_conn->fd < 0) && (difftime(time(NULL), fail_time) >= 10)) { /* The connection to Slurm DBD is not open */ _open_slurmdbd_conn(1); if (slurmdbd_conn->fd < 0) fail_time = time(NULL); } slurm_mutex_lock(&agent_lock); if (agent_list && slurmdbd_conn->fd) cnt = list_count(agent_list); else cnt = 0; if ((cnt == 0) || (slurmdbd_conn->fd < 0) || (fail_time && (difftime(time(NULL), fail_time) < 10))) { slurm_mutex_unlock(&slurmdbd_lock); abs_time.tv_sec = time(NULL) + 10; abs_time.tv_nsec = 0; slurm_cond_timedwait(&agent_cond, &agent_lock, &abs_time); slurm_mutex_unlock(&agent_lock); continue; } else if ((cnt > 0) && ((cnt % 100) == 0)) info("slurmdbd: agent queue size %u", cnt); /* Leave item on the queue until processing complete */ if (agent_list) { int handle_agent_count = 1000; if (cnt > handle_agent_count) { int agent_count = 0; ListIterator agent_itr = list_iterator_create(agent_list); list_msg.my_list = list_create(NULL); while ((buffer = list_next(agent_itr))) { list_enqueue(list_msg.my_list, buffer); agent_count++; if (agent_count > handle_agent_count) break; } list_iterator_destroy(agent_itr); buffer = pack_slurmdbd_msg( &list_req, SLURM_PROTOCOL_VERSION); } else if (cnt > 1) { list_msg.my_list = agent_list; buffer = pack_slurmdbd_msg( &list_req, SLURM_PROTOCOL_VERSION); } else buffer = (Buf) list_peek(agent_list); } else buffer 
= NULL; slurm_mutex_unlock(&agent_lock); if (buffer == NULL) { slurm_mutex_unlock(&slurmdbd_lock); slurm_mutex_lock(&assoc_cache_mutex); if (slurmdbd_conn->fd >= 0 && running_cache) slurm_cond_signal(&assoc_cache_cond); slurm_mutex_unlock(&assoc_cache_mutex); continue; } /* NOTE: agent_lock is clear here, so we can add more * requests to the queue while waiting for this RPC to * complete. */ rc = slurm_persist_send_msg(slurmdbd_conn, buffer); if (rc != SLURM_SUCCESS) { if (*slurmdbd_conn->shutdown) { slurm_mutex_unlock(&slurmdbd_lock); break; } error("slurmdbd: Failure sending message: %d: %m", rc); } else if (list_msg.my_list) { rc = _handle_mult_rc_ret(); } else { rc = _get_return_code(); if (rc == EAGAIN) { if (*slurmdbd_conn->shutdown) { slurm_mutex_unlock(&slurmdbd_lock); break; } error("slurmdbd: Failure with " "message need to resend: %d: %m", rc); } } slurm_mutex_unlock(&slurmdbd_lock); slurm_mutex_lock(&assoc_cache_mutex); if (slurmdbd_conn->fd >= 0 && running_cache) slurm_cond_signal(&assoc_cache_cond); slurm_mutex_unlock(&assoc_cache_mutex); slurm_mutex_lock(&agent_lock); if (agent_list && (rc == SLURM_SUCCESS)) { /* * If we sent a mult_msg we just need to free buffer, * we don't need to requeue, just mark list_msg.my_list * as NULL as that is the sign we sent a mult_msg. */ if (list_msg.my_list) { if (list_msg.my_list != agent_list) FREE_NULL_LIST(list_msg.my_list); list_msg.my_list = NULL; } else buffer = (Buf) list_dequeue(agent_list); free_buf(buffer); fail_time = 0; } else { /* We need to free a mult_msg even on failure */ if (list_msg.my_list) { if (list_msg.my_list != agent_list) FREE_NULL_LIST(list_msg.my_list); list_msg.my_list = NULL; free_buf(buffer); } fail_time = time(NULL); } slurm_mutex_unlock(&agent_lock); /* END_TIMER; */ /* info("at the end with %s", TIME_STR); */ } slurm_mutex_lock(&agent_lock); _save_dbd_state(); FREE_NULL_LIST(agent_list); slurm_mutex_unlock(&agent_lock); return NULL; }
/*
 * launch_common_create_job_step - fill in job->ctx_params from the given
 *	options and create the job step, retrying while step creation is
 *	temporarily refused.
 *
 * IN/OUT job - srun job record; ctx_params, step_ctx, stepid and nhosts
 *	are written
 * IN use_all_cpus - true if the job allocation was created by srun, in
 *	which case the step may be sized from job->cpu_count
 * IN signal_function - handler installed for every signal in sig_array
 *	once the first creation attempt has to be retried
 * IN destroy_job - flag polled between attempts; a non-zero value means the
 *	request was cancelled by a signal
 * IN/OUT opt_local - options for this step.  ntasks, distribution,
 *	tres_bind and tres_freq may be rewritten here.
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job,
					 slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	int i, j, rc;
	unsigned long step_wait = 0;
	uint16_t base_dist, slurmctld_timeout;
	char *add_tres;

	xassert(srun_opt);

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.step_id = job->stepid;
	job->ctx_params.uid = opt_local->uid;

	/* Validate minimum and maximum node counts */
	if (opt_local->min_nodes && opt_local->max_nodes &&
	    (opt_local->min_nodes > opt_local->max_nodes)) {
		error ("Minimum node count > maximum node count (%d > %d)",
		       opt_local->min_nodes, opt_local->max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END
	if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) {
		error ("Minimum node count > allocated node count (%d > %d)",
		       opt_local->min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif

	/* Node range: start from the allocation size, then narrow it */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt_local->min_nodes &&
	    (opt_local->min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt_local->min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt_local->max_nodes &&
	    (opt_local->max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt_local->max_nodes;

	if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL))
		job->ntasks = opt_local->ntasks =
			job->nhosts * opt_local->ntasks_per_node;
	job->ctx_params.task_count = opt_local->ntasks;

	if (opt_local->mem_per_cpu != NO_VAL64)
		job->ctx_params.pn_min_memory =
			opt_local->mem_per_cpu | MEM_PER_CPU;
	else if (opt_local->pn_min_memory != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->pn_min_memory;

	if (opt_local->overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt_local->cpus_set) {
		job->ctx_params.cpu_count =
			opt_local->ntasks * opt_local->cpus_per_task;
	} else if (opt_local->ntasks_set) {
		job->ctx_params.cpu_count = opt_local->ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt_local->ntasks;
	}

	job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)srun_opt->relative;
	job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval;
	job->ctx_params.ckpt_dir = srun_opt->ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive;
	if (opt_local->immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt_local->immediate;
	if (opt_local->time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt_local->time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (srun_opt->resv_port_cnt != NO_VAL) {
		job->ctx_params.resv_port_cnt =
			(uint16_t)srun_opt->resv_port_cnt;
	} else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (srun_opt->multi_prog ?
					srun_opt->multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt_local->plane_size;
		break;
	default:
		/* Leave distribution set to unknown if taskcount <= nodes and
		 * memory is set to 0. step_mgr will handle the 0mem case.
		 * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */
		if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory)
			base_dist = SLURM_DIST_UNKNOWN;
		else
			base_dist = (job->ctx_params.task_count <=
				     job->ctx_params.min_nodes) ?
				    SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt_local->distribution &= SLURM_DIST_STATE_FLAGS;
		opt_local->distribution |= base_dist;
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	}

	job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0;
	job->ctx_params.node_list = opt_local->nodelist;
	job->ctx_params.network = opt_local->network;
	job->ctx_params.no_kill = opt_local->no_kill;
	if (srun_opt->job_name_set_cmd && opt_local->job_name)
		job->ctx_params.name = opt_local->job_name;
	else
		job->ctx_params.name = srun_opt->cmd_name;
	job->ctx_params.features = opt_local->constraints;

	if (opt_local->cpus_per_gpu) {
		xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d",
			   opt_local->cpus_per_gpu);
	}

	xfree(opt_local->tres_bind);	/* Vestigial value from job allocate */
	if (opt_local->gpu_bind)
		xstrfmtcat(opt_local->tres_bind, "gpu:%s",
			   opt_local->gpu_bind);
	if (tres_bind_verify_cmdline(opt_local->tres_bind)) {
		if (tres_bind_err_log) {	/* Log once */
			error("Invalid --tres-bind argument: %s. Ignored",
			      opt_local->tres_bind);
			tres_bind_err_log = false;
		}
		xfree(opt_local->tres_bind);
	}
	job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind);

	xfree(opt_local->tres_freq);	/* Vestigial value from job allocate */
	xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq);
	if (tres_freq_verify_cmdline(opt_local->tres_freq)) {
		if (tres_freq_err_log) {	/* Log once */
			error("Invalid --tres-freq argument: %s. Ignored",
			      opt_local->tres_freq);
			tres_freq_err_log = false;
		}
		xfree(opt_local->tres_freq);
	}
	job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq);

	job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job);
	xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus);
	xfmt_tres(&job->ctx_params.tres_per_node, "gpu",
		  opt_local->gpus_per_node);

	/* An explicit gres option wins over the SLURM_STEP_GRES environment */
	if (opt_local->gres)
		add_tres = opt_local->gres;
	else
		add_tres = getenv("SLURM_STEP_GRES");
	if (add_tres) {
		if (job->ctx_params.tres_per_node) {
			xstrfmtcat(job->ctx_params.tres_per_node, ",%s",
				   add_tres);
		} else
			job->ctx_params.tres_per_node = xstrdup(add_tres);
	}
	xfmt_tres(&job->ctx_params.tres_per_socket, "gpu",
		  opt_local->gpus_per_socket);
	xfmt_tres(&job->ctx_params.tres_per_task, "gpu",
		  opt_local->gpus_per_task);
	if (opt_local->mem_per_gpu) {
		/*
		 * Use this step's own options, not the global "opt": the two
		 * can differ when the caller passes a different option set.
		 */
		xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64,
			   opt_local->mem_per_gpu);
	}

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	/* Retry until the step exists, a hard error occurs, or we are
	 * cancelled.  "i" counts the failed attempts so far. */
	for (i = 0; (!(*destroy_job)); i++) {
		if (srun_opt->no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			if (opt_local->immediate) {
				/* Wait for whatever remains of the
				 * "immediate" budget, at least one second */
				step_wait = MAX(1, opt_local->immediate -
						   difftime(time(NULL),
							    srun_begin_time)) *
					    1000;
			} else {
				slurmctld_timeout = MIN(300, MAX(60,
					slurm_get_slurmctld_timeout()));
				/* The pid-derived term spreads the timeouts
				 * of different srun processes */
				step_wait = ((getpid() % 10) +
					     slurmctld_timeout) * 1000;
			}
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0) {
				info("Step created for job %u",
				     job->ctx_params.job_id);
			}
			break;
		}
		rc = slurm_get_errno();

		/* Give up if the "immediate" budget is spent or the error is
		 * not one that is worth retrying */
		if (((opt_local->immediate != 0) &&
		     ((opt_local->immediate == 1) ||
		      (difftime(time(NULL), srun_begin_time) >=
		       opt_local->immediate))) ||
		    ((rc != ESLURM_PROLOG_RUNNING) &&
		     !slurm_step_retry_errno(rc))) {
			error("Unable to create step for job %u: %m",
			      job->ctx_params.job_id);
			return SLURM_ERROR;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job %u step creation temporarily disabled, retrying",
				     job->ctx_params.job_id);
			}
			/* First retry: let the caller's handler see signals */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
		} else {
			verbose("Job %u step creation still disabled, retrying",
				job->ctx_params.job_id);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending step for job %u",
			     job->ctx_params.job_id);
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*
	 * Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job, opt_local);

	return SLURM_SUCCESS;
}
/* * Allocate nodes for heterogeneous/pack job from the slurm controller -- * retrying the attempt if the controller appears to be down, and optionally * waiting for resources if none are currently available (see opt.immediate) * * Returns a pointer to a resource_allocation_response_msg which must * be freed with slurm_free_resource_allocation_response_msg() */ List allocate_pack_nodes(bool handle_signals) { resource_allocation_response_msg_t *resp = NULL; bool jobid_log = true; job_desc_msg_t *j, *first_job = NULL; slurm_allocation_callbacks_t callbacks; ListIterator opt_iter, resp_iter; slurm_opt_t *opt_local, *first_opt = NULL; List job_req_list = NULL, job_resp_list = NULL; uint32_t my_job_id = 0; int i, k; job_req_list = list_create(NULL); opt_iter = list_iterator_create(opt_list); while ((opt_local = list_next(opt_iter))) { srun_opt_t *srun_opt = opt_local->srun_opt; xassert(srun_opt); if (!first_opt) first_opt = opt_local; if (srun_opt->relative_set && srun_opt->relative) fatal("--relative option invalid for job allocation request"); if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL) return NULL; if (!first_job) first_job = j; j->origin_cluster = xstrdup(slurmctld_conf.cluster_name); /* Do not re-use existing job id when submitting new job * from within a running job */ if ((j->job_id != NO_VAL) && !opt_local->jobid_set) { if (jobid_log) { jobid_log = false; /* log once */ info("WARNING: Creating SLURM job allocation from within " "another allocation"); info("WARNING: You are attempting to initiate a second job"); } if (!opt_local->jobid_set) /* Let slurmctld set jobid */ j->job_id = NO_VAL; } list_append(job_req_list, j); } list_iterator_destroy(opt_iter); if (!first_job) { error("%s: No job requests found", __func__); return NULL; } if (first_opt && first_opt->clusters && (slurmdb_get_first_pack_cluster(job_req_list, first_opt->clusters, &working_cluster_rec) != SLURM_SUCCESS)) { print_db_notok(first_opt->clusters, 0); return NULL; } 
callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; callbacks.job_suspend = NULL; callbacks.user_msg = _user_msg_handler; callbacks.node_fail = _node_fail_handler; /* create message thread to handle pings and such from slurmctld */ msg_thr = slurm_allocation_msg_thr_create(&first_job->other_port, &callbacks); /* NOTE: Do not process signals in separate pthread. The signal will * cause slurm_allocate_resources_blocking() to exit immediately. */ if (handle_signals) { xsignal_unblock(sig_array); for (i = 0; sig_array[i]; i++) xsignal(sig_array[i], _signal_while_allocating); } while (first_opt && !job_resp_list) { job_resp_list = slurm_allocate_pack_job_blocking(job_req_list, first_opt->immediate, _set_pending_job_id); if (destroy_job) { /* cancelled by signal */ break; } else if (!job_resp_list && !_retry()) { break; } } if (job_resp_list && !destroy_job) { /* * Allocation granted! */ opt_iter = list_iterator_create(opt_list); resp_iter = list_iterator_create(job_resp_list); while ((opt_local = list_next(opt_iter))) { resp = (resource_allocation_response_msg_t *) list_next(resp_iter); if (!resp) break; if (pending_job_id == 0) pending_job_id = resp->job_id; if (my_job_id == 0) { my_job_id = resp->job_id; i = list_count(opt_list); k = list_count(job_resp_list); if (i != k) { error("%s: request count != response count (%d != %d)", __func__, i, k); goto relinquish; } } /* * These values could be changed while the job was * pending so overwrite the request with what was * allocated so we don't have issues when we use them * in the step creation. * * NOTE: pn_min_memory here is an int64, not uint64. 
* These operations may have some bizarre side effects */ if (opt_local->pn_min_memory != NO_VAL64) opt_local->pn_min_memory = (resp->pn_min_memory & (~MEM_PER_CPU)); else if (opt_local->mem_per_cpu != NO_VAL64) opt_local->mem_per_cpu = (resp->pn_min_memory & (~MEM_PER_CPU)); #ifdef HAVE_BG uint32_t node_cnt = 0; select_g_select_jobinfo_get(resp->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &node_cnt); if ((node_cnt == 0) || (node_cnt == NO_VAL)) { opt_local->min_nodes = node_cnt; opt_local->max_nodes = node_cnt; } /* else we just use the original request */ if (!_wait_bluegene_block_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the block."); goto relinquish; } #else opt_local->min_nodes = resp->node_cnt; opt_local->max_nodes = resp->node_cnt; if (resp->working_cluster_rec) slurm_setup_remote_working_cluster(resp); if (!_wait_nodes_ready(resp)) { if (!destroy_job) error("Something is wrong with the " "boot of the nodes."); goto relinquish; } #endif } list_iterator_destroy(resp_iter); list_iterator_destroy(opt_iter); } else if (destroy_job) { goto relinquish; } if (handle_signals) xsignal_block(sig_array); return job_resp_list; relinquish: if (job_resp_list) { if (!destroy_job && my_job_id) slurm_complete_job(my_job_id, 1); list_destroy(job_resp_list); } exit(error_exit); return NULL; }
/*
 * Allocate nodes from the slurm controller -- retrying the attempt
 * if the controller appears to be down, and optionally waiting for
 * resources if none are currently available (see opt_local->immediate)
 *
 * IN handle_signals - if true, install _signal_while_allocating for the
 *	signals in sig_array while waiting and block them again afterwards
 * IN/OUT opt_local - options describing the request.  On success the
 *	memory and node count fields are overwritten with what was allocated.
 * RET a pointer to a resource_allocation_response_msg which must
 *	be freed with slurm_free_resource_allocation_response_msg(), or NULL
 *	if no allocation was made.  If the request is cancelled by a signal
 *	or the resources fail to become ready, this function does not
 *	return: it calls exit(error_exit).
 */
extern resource_allocation_response_msg_t *
allocate_nodes(bool handle_signals, slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j;
	slurm_allocation_callbacks_t callbacks;
	int i;

	xassert(srun_opt);

	if (srun_opt->relative_set && srun_opt->relative)
		fatal("--relative option invalid for job allocation request");

	/* Build the request from the caller's options, not the global "opt" */
	if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
		return NULL;

	if (opt_local->clusters &&
	    (slurmdb_get_first_avail_cluster(j, opt_local->clusters,
					     &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(opt_local->clusters, 0);
		/* The request is not used past this point */
		job_desc_msg_destroy(j);
		return NULL;
	}

	j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		/* Let slurmctld set jobid */
		j->job_id = NO_VAL;
	}

	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j,
							 opt_local->immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp)
		print_multi_line_string(resp->job_submit_user_msg, -1);

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		opt_local->pn_min_memory = NO_VAL64;
		opt_local->mem_per_cpu = NO_VAL64;
		if (resp->pn_min_memory != NO_VAL64) {
			/* The MEM_PER_CPU flag selects which field is set */
			if (resp->pn_min_memory & MEM_PER_CPU) {
				opt_local->mem_per_cpu =
					(resp->pn_min_memory &
					 (~MEM_PER_CPU));
			} else {
				opt_local->pn_min_memory =
					resp->pn_min_memory;
			}
		}

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt == 0) || (node_cnt == NO_VAL)) {
			opt_local->min_nodes = node_cnt;
			opt_local->max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the "
				      "boot of the block.");
			goto relinquish;
		}
#else
		opt_local->min_nodes = resp->node_cnt;
		opt_local->max_nodes = resp->node_cnt;

		if (resp->working_cluster_rec)
			slurm_setup_remote_working_cluster(resp);

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}