/*
 * Return the step layout recorded in a job's step context.
 *
 * IN job - srun job with a previously created step context
 * RET pointer to the layout owned by the step context, or NULL if the
 *     job, its context, or the create-response is unavailable
 */
extern slurm_step_layout_t *launch_common_get_slurm_step_layout(srun_job_t *job)
{
	job_step_create_response_msg_t *step_resp = NULL;

	if (job && job->step_ctx) {
		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_RESP,
				   &step_resp);
		if (step_resp)
			return step_resp->step_layout;
	}
	return NULL;
}
/*
 * Handle tasks that failed to claim their reserved OpenMPI port:
 * log the failure (retrying vs. aborting based on the retry count)
 * and kill the job step so it can be relaunched or abandoned.
 *
 * IN tasks    - string describing the affected task IDs
 * IN hosts    - string describing the hosts the tasks ran on
 * IN step_ctx - step context used to look up job and step IDs
 */
static void _handle_openmpi_port_error(const char *tasks, const char *hosts,
				       slurm_step_ctx_t *step_ctx)
{
	uint32_t step_job_id, local_step_id;
	const char *action;

	/* Count this launch attempt against the retry budget only once */
	if (!retry_step_begin) {
		retry_step_begin = true;
		retry_step_cnt++;
	}

	action = (retry_step_cnt >= MAX_STEP_RETRIES) ?
		"aborting" : "retrying";
	error("%s: tasks %s unable to claim reserved port, %s.",
	      hosts, tasks, action);

	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &step_job_id);
	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &local_step_id);
	info("Terminating job step %u.%u", step_job_id, local_step_id);
	slurm_kill_job_step(step_job_id, local_step_id, SIGKILL);
}
/*
 * Build step-creation parameters from the global opt structure and create
 * the job step, retrying while the controller reports transient errors
 * (busy nodes/ports, prolog still running, etc.).
 *
 * IN job             - srun job record (step context stored on success)
 * IN use_all_cpus    - true if the job allocation was created by srun,
 *                      in which case all allocated CPUs are used
 * IN signal_function - handler installed for the retry wait period
 * IN destroy_job     - set asynchronously by a signal to abort the wait
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * Fix: the signal-install loop below previously reused the outer retry
 * counter "i" as its index, clobbering the retry count; a separate index
 * "j" is now used (matching the newer implementation in this file).
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int i, j, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error("Minimum node count > maximum node count (%d > %d)",
		      opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error("Minimum node count > allocated node count (%d > %d)",
		      opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	/* Clamp requested node range to the allocation */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	/* Derive task count from --ntasks-per-node when -n was not given */
	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/* Pick cyclic vs. block based on tasks-per-node density */
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes) ?
			SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;
	job->ctx_params.node_list = opt.nodelist;
	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	begin_time = time(NULL);
	/* Retry step creation until success, fatal error, or cancellation */
	for (i = 0; (!(*destroy_job)); i++) {
		bool blocking_step_create = true;

		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		/* Give up on hard errors or when --immediate has expired */
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/* Allow signals to interrupt the wait.
			 * Use index j, NOT the retry counter i. */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
			if (!blocking_step_create)
				my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			if (!blocking_step_create)
				my_sleep *= 2;
		}
		if (!blocking_step_create) {
			/* sleep 0.1 to 29 secs with exponential back-off */
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}
		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending job step");
			return SLURM_ERROR;
		}
	}
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
/*
 * Build step-creation parameters from the global opt structure and create
 * the job step, retrying with exponential back-off while the controller
 * reports transient errors.
 *
 * IN job          - srun job record (step context stored on success)
 * IN use_all_cpus - true if the job allocation was created by srun
 * RET 0 on success, -1 on error
 *
 * Fixes: removed a duplicate assignment of job->ctx_params.gres (it was
 * set twice from opt.gres), and the signal-install loop now uses its own
 * index "j" instead of clobbering the outer retry counter "i".
 */
extern int create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int i, j, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error("Minimum node count > maximum node count (%d > %d)",
		      opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error("Minimum node count > allocated node count (%d > %d)",
		      opt.min_nodes, job->nhosts);
		return -1;
	}
#endif
	/* Clamp requested node range to the allocation */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	/* Derive task count from --ntasks-per-node when -n was not given */
	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		/* Pick cyclic vs. block based on tasks-per-node density */
		job->ctx_params.task_dist =
			(job->ctx_params.task_count <=
			 job->ctx_params.min_nodes) ?
			SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;
	job->ctx_params.node_list = opt.nodelist;
	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	begin_time = time(NULL);
	/* Retry step creation until success, fatal error, or cancellation */
	for (i = 0; (!destroy_job); i++) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		/* Give up on hard errors or when --immediate has expired */
		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error("Unable to create job step: %m");
			return -1;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			/* Allow signals to interrupt the wait.
			 * Use index j, NOT the retry counter i. */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j],
					_signal_while_allocating);
			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (destroy_job) {
			info("Cancelled pending job step");
			return -1;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
/*
 * Test driver: allocate resources, create a job step using "user managed"
 * I/O (one TCP stream per task), launch the tasks, interact with them,
 * then kill the job and clean up.
 *
 * Usage: prog [min_nodes [max_nodes [tasks]]]
 */
int main (int argc, char *argv[])
{
	int i, min_nodes = 1, max_nodes = 1, nodes, tasks = 0, rc = 0;
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *job_resp;
	slurm_step_ctx_params_t step_params[1];
	slurm_step_ctx_t *ctx = NULL;
	slurm_step_launch_params_t launch[1];
	char *task_argv[3];
	int *fd_array = NULL;
	int num_fd;

	/* Optional argv[1]/argv[2] override min/max node counts */
	if (argc > 1) {
		i = atoi(argv[1]);
		if (i > 0)
			min_nodes = i;
	}
	if (argc > 2) {
		i = atoi(argv[2]);
		if (i > 0)
			max_nodes = i;
	}
	if (max_nodes < min_nodes)
		max_nodes = min_nodes;

	/* Create a job allocation */
	slurm_init_job_desc_msg( &job_req );
	job_req.min_nodes  = min_nodes;
	job_req.max_nodes  = max_nodes;
	job_req.user_id    = getuid();
	job_req.group_id   = getgid();
	job_req.time_limit = 1;
	if (slurm_allocate_resources(&job_req, &job_resp)) {
		slurm_perror ("slurm_allocate_resources");
		printf("INFO: min_nodes=%u max_nodes=%u user_id=%u group_id=%u",
		       job_req.min_nodes, job_req.max_nodes,
		       job_req.user_id, job_req.group_id);
		exit(0);
	}
	printf("job_id %u\n", job_resp->job_id);
	fflush(stdout);

	/* Wait for allocation request to be satisfied */
	if ((job_resp->node_list == NULL) ||
	    (strlen(job_resp->node_list) == 0)) {
		printf("Waiting for resource allocation\n");
		fflush(stdout);
		while ((job_resp->node_list == NULL) ||
		       (strlen(job_resp->node_list) == 0)) {
			/* Poll every 5 seconds until nodes are assigned */
			sleep(5);
			if (slurm_allocation_lookup_lite(job_resp->job_id,
							 &job_resp) &&
			    (slurm_get_errno() != ESLURM_JOB_PENDING)) {
				slurm_perror("slurm_confirm_allocation");
				exit(0);
			}
		}
	}
	nodes = job_resp->node_cnt;
	/* Optional argv[3] overrides the task count */
	if (argc > 3)
		tasks = atoi(argv[3]);
	if (tasks < 1)
		tasks = nodes * TASKS_PER_NODE;
	if (tasks < nodes) {
		fprintf(stderr, "Invalid task count argument\n");
		exit(1);
	}
	printf("Starting %d tasks on %d nodes\n", tasks, nodes);
	fflush(stdout);

	/*
	 * Create a job step context.
	 */
	slurm_step_ctx_params_t_init(step_params);
	step_params->job_id = job_resp->job_id;
	step_params->min_nodes = nodes;
	step_params->task_count = tasks;
	ctx = slurm_step_ctx_create(step_params);
	if ((ctx == NULL) &&
	    (slurm_get_errno() == ESLURM_PROLOG_RUNNING)) {
		/* One retry after the prolog has had time to finish */
		printf("SlurmctldProlog is still running, "
		       "sleep and try again\n");
		sleep(10);
		ctx = slurm_step_ctx_create(step_params);
	}
	if (ctx == NULL) {
		slurm_perror("slurm_step_ctx_create");
		rc = 1;
		goto done;
	}

	/*
	 * Hack to run one task per node, regardless of what we set up
	 * when we created the job step context.
	 */
	if (slurm_step_ctx_daemon_per_node_hack(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_ctx_daemon_per_node_hack");
		rc = 1;
		goto done;
	}

	/*
	 * Launch the tasks using "user managed" IO.
	 * "user managed" IO means a TCP stream for each task, directly
	 * connected to the stdin, stdout, and stderr the task.
	 */
	slurm_step_launch_params_t_init(launch);
	task_argv[0] = "./test7.3.io";
	launch->argv = task_argv;
	launch->argc = 1;
	launch->user_managed_io = true; /* This is the key to using
					   "user managed" IO */
	if (slurm_step_launch(ctx, launch, NULL) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch");
		rc = 1;
		goto done;
	}
	if (slurm_step_launch_wait_start(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch_wait_start");
		rc =1;
		goto done;
	}
	/* Retrieve one connected socket per task */
	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
			   &num_fd, &fd_array);

	/* Interact with launched tasks as desired */
	_do_task_work(fd_array, tasks);

	for (i = 0; i < tasks; i++) {
		close(fd_array[i]);
	}

	slurm_step_launch_wait_finish(ctx);

	/* Terminate the job killing all tasks */
done:	slurm_kill_job(job_resp->job_id, SIGKILL, 0);

	/* clean up storage */
	/* NOTE(review): rc records failures above but exit(0) is always
	 * used here; looks intentional for the test harness — confirm */
	slurm_free_resource_allocation_response_msg(job_resp);
	if (ctx)
		slurm_step_ctx_destroy(ctx);
	exit(0);
}
/*
 * srun - main entry for running a parallel job: initialize plugins and
 * logging, create (or attach to) the job allocation, build the task
 * environment, optionally set up a pseudo-terminal, then launch the step
 * and wait for it to finish (relaunching if the step layer requests it).
 *
 * IN ac/av - command line argument count/vector
 * RET the job's global return code
 */
int srun(int ac, char **av)
{
	int debug_level;
	env_t *env = xmalloc(sizeof(env_t));
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	bool got_alloc = false;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	/* -1 marks "not yet known" for the per-task identifiers */
	env->stepid = -1;
	env->procid = -1;
	env->localid = -1;
	env->nodeid = -1;
	env->cli = NULL;
	env->env = NULL;
	env->ckpt_dir = NULL;

	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(av[0]), logopt, 0, NULL);
	_set_exit_code();

	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );

	if (switch_init() != SLURM_SUCCESS )
		fatal("failed to initialize switch plugin");

	init_srun(ac, av, &logopt, debug_level, 1);
	create_srun_job(&job, &got_alloc, 0, 1);

	/*
	 * Enhance environment for job
	 */
	if (opt.bcast_flag)
		_file_bcast();
	if (opt.cpus_set)
		env->cpus_per_task = opt.cpus_per_task;
	if (opt.ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt.ntasks_per_core;
	env->distribution = opt.distribution;
	if (opt.plane_size != NO_VAL)
		env->plane_size = opt.plane_size;
	env->cpu_bind_type = opt.cpu_bind_type;
	env->cpu_bind = opt.cpu_bind;
	env->cpu_freq_min = opt.cpu_freq_min;
	env->cpu_freq_max = opt.cpu_freq_max;
	env->cpu_freq_gov = opt.cpu_freq_gov;
	env->mem_bind_type = opt.mem_bind_type;
	env->mem_bind = opt.mem_bind;
	env->overcommit = opt.overcommit;
	env->slurmd_debug = opt.slurmd_debug;
	env->labelio = opt.labelio;
	env->comm_port = slurmctld_comm_addr.port;
	env->batch_flag = 0;
	if (opt.job_name)
		env->job_name = opt.job_name;
	if (job) {
		uint16_t *tasks = NULL;
		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS,
				   &tasks);
		env->select_jobinfo = job->select_jobinfo;
		env->nodelist = job->nodelist;
		env->partition = job->partition;
		/* If we didn't get the allocation don't overwrite the
		 * previous info.
		 */
		if (got_alloc)
			env->nhosts = job->nhosts;
		env->ntasks = job->ntasks;
		env->task_count = _uint16_array_to_str(job->nhosts, tasks);
		env->jobid = job->jobid;
		env->stepid = job->stepid;
		env->account = job->account;
		env->qos = job->qos;
		env->resv_name = job->resv_name;
	}
	if (opt.pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt.pty = false;
	}
	if (opt.pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing such that debug() and
		 * and error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}
	setup_env(env, opt.preserve_env);
	xfree(env->task_count);
	xfree(env);
	_set_node_alias();

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal = launch_g_fwd_signal;

	/* re_launch: */
relaunch:
	pre_launch_srun_job(job, 0, 1);

	launch_common_set_stdio_fds(job, &cio_fds);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
		/* -1 from step_wait means the step should be relaunched */
		if (launch_g_step_wait(job, got_alloc) == -1)
			goto relaunch;
	}

	fini_srun(job, got_alloc, &global_rc, 0);

	return (int)global_rc;
}
/*
 * Build step-creation parameters from the per-component options and
 * create the job step, retrying while the controller reports transient
 * errors (prolog running, retryable errno).
 *
 * IN job             - srun job record (step context stored on success)
 * IN use_all_cpus    - true if the job allocation was created by srun,
 *                      in which case all allocated CPUs are used
 * IN signal_function - handler installed for the retry wait period
 * IN destroy_job     - set asynchronously by a signal to abort the wait
 * IN opt_local       - options for this job component (NOT the global
 *                      opt; pack/het jobs carry one opt per component)
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * Fix: mem_per_tres was built from the global opt.mem_per_gpu instead of
 * opt_local->mem_per_gpu, so per-component --mem-per-gpu values were
 * ignored for pack/het job steps.
 */
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job,
					 slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	int i, j, rc;
	unsigned long step_wait = 0;
	uint16_t base_dist, slurmctld_timeout;
	char *add_tres;

	xassert(srun_opt);

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.step_id = job->stepid;
	job->ctx_params.uid = opt_local->uid;

	/* Validate minimum and maximum node counts */
	if (opt_local->min_nodes && opt_local->max_nodes &&
	    (opt_local->min_nodes > opt_local->max_nodes)) {
		error("Minimum node count > maximum node count (%d > %d)",
		      opt_local->min_nodes, opt_local->max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END
	if (opt_local->min_nodes && (opt_local->min_nodes > job->nhosts)) {
		error("Minimum node count > allocated node count (%d > %d)",
		      opt_local->min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	/* Clamp requested node range to the allocation */
	job->ctx_params.min_nodes = job->nhosts;
	if (opt_local->min_nodes &&
	    (opt_local->min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt_local->min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt_local->max_nodes &&
	    (opt_local->max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt_local->max_nodes;

	/* Derive task count from --ntasks-per-node when -n was not given */
	if (!opt_local->ntasks_set && (opt_local->ntasks_per_node != NO_VAL))
		job->ntasks = opt_local->ntasks =
			job->nhosts * opt_local->ntasks_per_node;
	job->ctx_params.task_count = opt_local->ntasks;

	if (opt_local->mem_per_cpu != NO_VAL64)
		job->ctx_params.pn_min_memory =
			opt_local->mem_per_cpu | MEM_PER_CPU;
	else if (opt_local->pn_min_memory != NO_VAL64)
		job->ctx_params.pn_min_memory = opt_local->pn_min_memory;

	if (opt_local->overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt_local->cpus_set) {
		job->ctx_params.cpu_count =
			opt_local->ntasks * opt_local->cpus_per_task;
	} else if (opt_local->ntasks_set) {
		job->ctx_params.cpu_count = opt_local->ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt_local->ntasks;
	}

	job->ctx_params.cpu_freq_min = opt_local->cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt_local->cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt_local->cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)srun_opt->relative;
	job->ctx_params.ckpt_interval = (uint16_t)srun_opt->ckpt_interval;
	job->ctx_params.ckpt_dir = srun_opt->ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)srun_opt->exclusive;
	if (opt_local->immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt_local->immediate;
	if (opt_local->time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt_local->time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (srun_opt->resv_port_cnt != NO_VAL) {
		job->ctx_params.resv_port_cnt =
			(uint16_t)srun_opt->resv_port_cnt;
	} else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt =
			(srun_opt->multi_prog ?
			 srun_opt->multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt_local->distribution & SLURM_DIST_NODESOCKMASK) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt_local->plane_size;
		break;
	default:
		/* Leave distribution set to unknown if taskcount <= nodes and
		 * memory is set to 0. step_mgr will handle the 0mem case.
		 * ex. SallocDefaultCommand=srun -n1 -N1 --mem=0 ... */
		if (!opt_local->mem_per_cpu || !opt_local->pn_min_memory)
			base_dist = SLURM_DIST_UNKNOWN;
		else
			base_dist = (job->ctx_params.task_count <=
				     job->ctx_params.min_nodes) ?
				SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt_local->distribution &= SLURM_DIST_STATE_FLAGS;
		opt_local->distribution |= base_dist;
		job->ctx_params.task_dist = opt_local->distribution;
		if (opt_local->ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size =
				opt_local->ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt_local->overcommit ? 1 : 0;
	job->ctx_params.node_list = opt_local->nodelist;
	job->ctx_params.network = opt_local->network;
	job->ctx_params.no_kill = opt_local->no_kill;
	if (srun_opt->job_name_set_cmd && opt_local->job_name)
		job->ctx_params.name = opt_local->job_name;
	else
		job->ctx_params.name = srun_opt->cmd_name;
	job->ctx_params.features = opt_local->constraints;

	/* Translate GPU options into generic TRES specifications */
	if (opt_local->cpus_per_gpu) {
		xstrfmtcat(job->ctx_params.cpus_per_tres, "gpu:%d",
			   opt_local->cpus_per_gpu);
	}
	xfree(opt_local->tres_bind);	/* Vestigial value from job allocate */
	if (opt_local->gpu_bind)
		xstrfmtcat(opt_local->tres_bind, "gpu:%s",
			   opt_local->gpu_bind);
	if (tres_bind_verify_cmdline(opt_local->tres_bind)) {
		if (tres_bind_err_log) {	/* Log once */
			error("Invalid --tres-bind argument: %s. Ignored",
			      opt_local->tres_bind);
			tres_bind_err_log = false;
		}
		xfree(opt_local->tres_bind);
	}
	job->ctx_params.tres_bind = xstrdup(opt_local->tres_bind);
	xfree(opt_local->tres_freq);	/* Vestigial value from job allocate */
	xfmt_tres_freq(&opt_local->tres_freq, "gpu", opt_local->gpu_freq);
	if (tres_freq_verify_cmdline(opt_local->tres_freq)) {
		if (tres_freq_err_log) {	/* Log once */
			error("Invalid --tres-freq argument: %s. Ignored",
			      opt_local->tres_freq);
			tres_freq_err_log = false;
		}
		xfree(opt_local->tres_freq);
	}
	job->ctx_params.tres_freq = xstrdup(opt_local->tres_freq);
	job->ctx_params.tres_per_step = xstrdup(opt_local->tres_per_job);
	xfmt_tres(&job->ctx_params.tres_per_step, "gpu", opt_local->gpus);
	xfmt_tres(&job->ctx_params.tres_per_node, "gpu",
		  opt_local->gpus_per_node);
	if (opt_local->gres)
		add_tres = opt_local->gres;
	else
		add_tres = getenv("SLURM_STEP_GRES");
	if (add_tres) {
		if (job->ctx_params.tres_per_node) {
			xstrfmtcat(job->ctx_params.tres_per_node, ",%s",
				   add_tres);
		} else
			job->ctx_params.tres_per_node = xstrdup(add_tres);
	}
	xfmt_tres(&job->ctx_params.tres_per_socket, "gpu",
		  opt_local->gpus_per_socket);
	xfmt_tres(&job->ctx_params.tres_per_task, "gpu",
		  opt_local->gpus_per_task);
	if (opt_local->mem_per_gpu) {
		/* Use this component's value, not the global opt */
		xstrfmtcat(job->ctx_params.mem_per_tres, "gpu:%"PRIi64,
			   opt_local->mem_per_gpu);
	}

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);

	/* Retry step creation until success, fatal error, or cancellation */
	for (i = 0; (!(*destroy_job)); i++) {
		if (srun_opt->no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else {
			if (opt_local->immediate) {
				step_wait = MAX(1, opt_local->immediate -
						difftime(time(NULL),
							 srun_begin_time)) *
					1000;
			} else {
				slurmctld_timeout = MIN(300, MAX(60,
					slurm_get_slurmctld_timeout()));
				step_wait = ((getpid() % 10) +
					     slurmctld_timeout) * 1000;
			}
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0) {
				info("Step created for job %u",
				     job->ctx_params.job_id);
			}
			break;
		}
		rc = slurm_get_errno();

		/* Give up on hard errors or when --immediate has expired */
		if (((opt_local->immediate != 0) &&
		     ((opt_local->immediate == 1) ||
		      (difftime(time(NULL), srun_begin_time) >=
		       opt_local->immediate))) ||
		    ((rc != ESLURM_PROLOG_RUNNING) &&
		     !slurm_step_retry_errno(rc))) {
			error("Unable to create step for job %u: %m",
			      job->ctx_params.job_id);
			return SLURM_ERROR;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job %u step creation temporarily disabled, retrying",
				     job->ctx_params.job_id);
			}
			/* Allow signals to interrupt the wait */
			xsignal_unblock(sig_array);
			for (j = 0; sig_array[j]; j++)
				xsignal(sig_array[j], signal_function);
		} else {
			verbose("Job %u step creation still disabled, retrying",
				job->ctx_params.job_id);
		}

		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending step for job %u",
			     job->ctx_params.job_id);
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/*
	 * Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case.
	 */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job, opt_local);

	return SLURM_SUCCESS;
}
/*
 * Populate and export the environment for one job component, then free
 * the temporary env_t. Also performs optional file broadcast and
 * pseudo-terminal setup when the corresponding options are set.
 *
 * IN opt_local - options for this job component
 * IN job       - the component's srun job record
 * IN got_alloc - true if this srun created the allocation; only then is
 *                env->nhosts overwritten
 */
static void _setup_one_job_env(opt_t *opt_local, srun_job_t *job,
			       bool got_alloc)
{
	env_t *env = xmalloc(sizeof(env_t));
	uint16_t *tasks = NULL;

	xassert(job);

	/* -1 marks "not yet known" for the per-task identifiers */
	env->localid = -1;
	env->nodeid = -1;
	env->procid = -1;
	env->stepid = -1;

	if (opt_local->bcast_flag)
		_file_bcast(opt_local, job);
	if (opt_local->cpus_set)
		env->cpus_per_task = opt_local->cpus_per_task;
	if (opt_local->ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt_local->ntasks_per_node;
	if (opt_local->ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt_local->ntasks_per_socket;
	if (opt_local->ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt_local->ntasks_per_core;
	env->distribution = opt_local->distribution;
	if (opt_local->plane_size != NO_VAL)
		env->plane_size = opt_local->plane_size;
	env->cpu_bind_type = opt_local->cpu_bind_type;
	env->cpu_bind = opt_local->cpu_bind;
	env->cpu_freq_min = opt_local->cpu_freq_min;
	env->cpu_freq_max = opt_local->cpu_freq_max;
	env->cpu_freq_gov = opt_local->cpu_freq_gov;
	env->mem_bind_type = opt_local->mem_bind_type;
	env->mem_bind = opt_local->mem_bind;
	env->overcommit = opt_local->overcommit;
	env->slurmd_debug = opt_local->slurmd_debug;
	env->labelio = opt_local->labelio;
	env->comm_port = slurmctld_comm_addr.port;
	if (opt_local->job_name)
		env->job_name = opt_local->job_name;

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, &tasks);
	env->select_jobinfo = job->select_jobinfo;
	/* Pack (heterogeneous) jobs expose the combined node list */
	if (job->pack_node_list)
		env->nodelist = job->pack_node_list;
	else
		env->nodelist = job->nodelist;
	env->partition = job->partition;
	/*
	 * If we didn't get the allocation don't overwrite the previous info.
	 */
	if (got_alloc)
		env->nhosts = job->nhosts;
	env->ntasks = job->ntasks;
	/* Pack jobs report the aggregate task count and pack job ID */
	if (job->pack_ntasks != NO_VAL)
		env->ntasks = job->pack_ntasks;
	env->task_count = _uint16_array_to_str(job->nhosts, tasks);
	if (job->pack_jobid != NO_VAL)
		env->jobid = job->pack_jobid;
	else
		env->jobid = job->jobid;
	env->stepid = job->stepid;
	env->account = job->account;
	env->qos = job->qos;
	env->resv_name = job->resv_name;

	if (opt_local->pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt_local->pty = false;
	}

	if (opt_local->pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing such that debug() and
		 * and error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}

	/* Export everything into the job's environment array */
	env->env = env_array_copy((const char **) environ);
	setup_env(env, opt_local->preserve_env);
	job->env = env->env;
	xfree(env->task_count);
	xfree(env);
}
/*
 * Launch the application step(s) for this srun invocation.
 *
 * For a heterogeneous ("pack") job, srun_job_list holds one srun_job_t per
 * component: this function first aggregates the combined node list, per-node
 * task counts and global task IDs across all components, then spawns one
 * detached _launch_one_app thread per component and waits (step_mutex /
 * step_cond) until every step has completed.  For a regular job it launches
 * the single step synchronously.
 *
 * job           - the (single) job, or scratch iterator variable when
 *                 srun_job_list is set
 * srun_job_list - list of pack-job components, or NULL for a regular job
 * got_alloc     - passed through to fini_srun()
 *
 * Fix: the pack_tids pointer array was leaked; its per-node task-ID arrays
 * are shallow-copied into each job->pack_tids, but the outer array itself
 * was never freed.  It is now released alongside pack_node_list and
 * pack_task_cnts.
 */
static void _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	opt_t *opt_local = NULL;
	_launch_app_data_t *opts;
	int total_ntasks = 0, total_nnodes = 0, step_cnt = 0, node_offset = 0;
	pthread_mutex_t step_mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t step_cond = PTHREAD_COND_INITIALIZER;
	srun_job_t *first_job = NULL;
	char *launch_type, *pack_node_list = NULL;
	bool need_mpir = false;
	uint16_t *tmp_task_cnt = NULL, *pack_task_cnts = NULL;
	uint32_t **tmp_tids = NULL, **pack_tids = NULL;

	/* MPIR (parallel debugger) tables are only needed for launch/slurm */
	launch_type = slurm_get_launch_type();
	if (launch_type && strstr(launch_type, "slurm"))
		need_mpir = true;
	xfree(launch_type);

	if (srun_job_list) {
		int pack_step_cnt = list_count(srun_job_list);
		first_job = (srun_job_t *) list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}

		/*
		 * Pass 1: aggregate node lists, per-node task counts and
		 * (offset-adjusted) global task IDs across all components.
		 */
		job_iter = list_iterator_create(srun_job_list);
		while ((job = (srun_job_t *) list_next(job_iter))) {
			char *node_list = NULL;
			int i, node_inx;
			total_ntasks += job->ntasks;
			total_nnodes += job->nhosts;
			xrealloc(pack_task_cnts,
				 sizeof(uint16_t) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			if (!tmp_task_cnt) {
				fatal("%s: job %u has NULL task array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			memcpy(pack_task_cnts + node_offset, tmp_task_cnt,
			       sizeof(uint16_t) * job->nhosts);
			xrealloc(pack_tids, sizeof(uint32_t *) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			if (!tmp_tids) {
				fatal("%s: job %u has NULL task ID array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			/* Shift each component's task IDs by its offset so
			 * they are global across the whole pack job */
			for (node_inx = 0; node_inx < job->nhosts;
			     node_inx++) {
				uint32_t *node_tids;
				node_tids = xmalloc(sizeof(uint32_t) *
						    tmp_task_cnt[node_inx]);
				for (i = 0; i < tmp_task_cnt[node_inx]; i++) {
					node_tids[i] = tmp_tids[node_inx][i] +
						       job->pack_task_offset;
				}
				pack_tids[node_offset + node_inx] = node_tids;
			}
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &node_list);
			if (!node_list) {
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);
			}
			if (pack_node_list)
				xstrfmtcat(pack_node_list, ",%s", node_list);
			else
				pack_node_list = xstrdup(node_list);
			xfree(node_list);
			node_offset += job->nhosts;
		}
		list_iterator_reset(job_iter);
		_reorder_pack_recs(&pack_node_list, &pack_task_cnts,
				   &pack_tids, total_nnodes);

		if (need_mpir)
			mpir_init(total_ntasks);

		/*
		 * Pass 2: spawn one detached launch thread per component,
		 * handing each a copy of the aggregated pack records.
		 */
		opt_iter = list_iterator_create(opt_list);
		while ((opt_local = (opt_t *) list_next(opt_iter))) {
			job = (srun_job_t *) list_next(job_iter);
			if (!job) {
				/* Let in-flight steps drain before dying */
				slurm_mutex_lock(&step_mutex);
				while (step_cnt > 0)
					slurm_cond_wait(&step_cond,
							&step_mutex);
				slurm_mutex_unlock(&step_mutex);
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
				break;	/* To eliminate CLANG error */
			}
			slurm_mutex_lock(&step_mutex);
			step_cnt++;
			slurm_mutex_unlock(&step_mutex);
			job->pack_node_list = xstrdup(pack_node_list);
			if ((pack_step_cnt > 1) && pack_task_cnts) {
				xassert(node_offset == job->pack_nnodes);
				job->pack_task_cnts =
					xmalloc(sizeof(uint16_t) *
						job->pack_nnodes);
				memcpy(job->pack_task_cnts, pack_task_cnts,
				       sizeof(uint16_t) * job->pack_nnodes);
				/* Shallow copy: the per-node ID arrays are
				 * shared with the job records from here on */
				job->pack_tids = xmalloc(sizeof(uint32_t *) *
							 job->pack_nnodes);
				memcpy(job->pack_tids, pack_tids,
				       sizeof(uint32_t *) *
				       job->pack_nnodes);
			}
			opts = xmalloc(sizeof(_launch_app_data_t));
			opts->got_alloc  = got_alloc;
			opts->job        = job;
			opts->opt_local  = opt_local;
			opts->step_cond  = &step_cond;
			opts->step_cnt   = &step_cnt;
			opts->step_mutex = &step_mutex;
			opt_local->pack_step_cnt = pack_step_cnt;

			slurm_thread_create_detached(NULL, _launch_one_app,
						     opts);
		}
		xfree(pack_node_list);
		xfree(pack_task_cnts);
		/*
		 * Free the outer pointer array only; the per-node task-ID
		 * arrays it pointed at were shallow-copied into each
		 * job->pack_tids above.  (Previously this array was leaked.)
		 * NOTE(review): if pack_step_cnt == 1 those inner arrays are
		 * never copied and still leak -- TODO confirm intended
		 * ownership in that case before freeing them here.
		 */
		xfree(pack_tids);
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);

		/* Wait for all launch threads to signal completion */
		slurm_mutex_lock(&step_mutex);
		while (step_cnt > 0)
			slurm_cond_wait(&step_cond, &step_mutex);
		slurm_mutex_unlock(&step_mutex);

		if (first_job)
			fini_srun(first_job, got_alloc, &global_rc, 0);
	} else {
		/* Regular (non-pack) job: single synchronous launch */
		if (need_mpir)
			mpir_init(job->ntasks);
		opts = xmalloc(sizeof(_launch_app_data_t));
		opts->got_alloc = got_alloc;
		opts->job       = job;
		opts->opt_local = &opt;
		opt.pack_step_cnt = 1;
		_launch_one_app(opts);
		fini_srun(job, got_alloc, &global_rc, 0);
	}
}