extern int create_job_step(srun_job_t *job, bool use_all_cpus)
{
	int i, rc;
	unsigned long my_sleep = 0;
	time_t begin_time;

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Set the jobid for totalview */
	totalview_jobid = NULL;
	xstrfmtcat(totalview_jobid, "%u", job->ctx_params.job_id);

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error("Minimum node count > maximum node count (%d > %d)",
		      opt.min_nodes, opt.max_nodes);
		return -1;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error("Minimum node count > allocated node count (%d > %d)",
		      opt.min_nodes, job->nhosts);
		return -1;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.mem_per_cpu = opt.mem_per_cpu;
	job->ctx_params.gres = opt.gres;

	if (use_all_cpus)
		job->ctx_params.cpu_count = job->cpu_count;
	else if (opt.overcommit)
		job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	else
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;

	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.gres = opt.gres;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t)opt.resv_port_cnt;

	switch (opt.distribution) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
		job->ctx_params.task_dist = opt.distribution;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		job->ctx_params.task_dist =
			(job->ctx_params.task_count <=
			 job->ctx_params.min_nodes) ?
			SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution = job->ctx_params.task_dist;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i = 0; (!destroy_job); i++) {
		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_DISABLED))) {
			error("Unable to create job step: %m");
			return -1;
		}

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i],
					_signal_while_allocating);

			my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			my_sleep = MIN((my_sleep * 2), 29000000);
		}
		/* sleep 0.1 to 29 secs with exponential back-off */
		usleep(my_sleep);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (destroy_job) {
			info("Cancelled pending job step");
			return -1;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case. */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return 0;
}
extern int launch_common_create_job_step(srun_job_t *job, bool use_all_cpus,
					 void (*signal_function)(int),
					 sig_atomic_t *destroy_job)
{
	int i, rc;
	unsigned long step_wait = 0, my_sleep = 0;
	time_t begin_time;
	uint16_t base_dist;

	if (!job) {
		error("launch_common_create_job_step: no job given");
		return SLURM_ERROR;
	}

	slurm_step_ctx_params_t_init(&job->ctx_params);
	job->ctx_params.job_id = job->jobid;
	job->ctx_params.uid = opt.uid;

	/* Validate minimum and maximum node counts */
	if (opt.min_nodes && opt.max_nodes &&
	    (opt.min_nodes > opt.max_nodes)) {
		error("Minimum node count > maximum node count (%d > %d)",
		      opt.min_nodes, opt.max_nodes);
		return SLURM_ERROR;
	}
#if !defined HAVE_FRONT_END || (defined HAVE_BGQ)
//#if !defined HAVE_FRONT_END || (defined HAVE_BGQ && defined HAVE_BG_FILES)
	if (opt.min_nodes && (opt.min_nodes > job->nhosts)) {
		error("Minimum node count > allocated node count (%d > %d)",
		      opt.min_nodes, job->nhosts);
		return SLURM_ERROR;
	}
#endif
	job->ctx_params.min_nodes = job->nhosts;
	if (opt.min_nodes && (opt.min_nodes < job->ctx_params.min_nodes))
		job->ctx_params.min_nodes = opt.min_nodes;
	job->ctx_params.max_nodes = job->nhosts;
	if (opt.max_nodes && (opt.max_nodes < job->ctx_params.max_nodes))
		job->ctx_params.max_nodes = opt.max_nodes;

	if (!opt.ntasks_set && (opt.ntasks_per_node != NO_VAL))
		job->ntasks = opt.ntasks = job->nhosts * opt.ntasks_per_node;
	job->ctx_params.task_count = opt.ntasks;

	if (opt.mem_per_cpu != NO_VAL)
		job->ctx_params.pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	else if (opt.pn_min_memory != NO_VAL)
		job->ctx_params.pn_min_memory = opt.pn_min_memory;
	if (opt.gres)
		job->ctx_params.gres = opt.gres;
	else
		job->ctx_params.gres = getenv("SLURM_STEP_GRES");

	if (opt.overcommit) {
		if (use_all_cpus)	/* job allocation created by srun */
			job->ctx_params.cpu_count = job->cpu_count;
		else
			job->ctx_params.cpu_count = job->ctx_params.min_nodes;
	} else if (opt.cpus_set) {
		job->ctx_params.cpu_count = opt.ntasks * opt.cpus_per_task;
	} else if (opt.ntasks_set) {
		job->ctx_params.cpu_count = opt.ntasks;
	} else if (use_all_cpus) {	/* job allocation created by srun */
		job->ctx_params.cpu_count = job->cpu_count;
	} else {
		job->ctx_params.cpu_count = opt.ntasks;
	}

	job->ctx_params.cpu_freq_min = opt.cpu_freq_min;
	job->ctx_params.cpu_freq_max = opt.cpu_freq_max;
	job->ctx_params.cpu_freq_gov = opt.cpu_freq_gov;
	job->ctx_params.relative = (uint16_t)opt.relative;
	job->ctx_params.ckpt_interval = (uint16_t)opt.ckpt_interval;
	job->ctx_params.ckpt_dir = opt.ckpt_dir;
	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
	if (opt.immediate == 1)
		job->ctx_params.immediate = (uint16_t)opt.immediate;
	if (opt.time_limit != NO_VAL)
		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
	job->ctx_params.verbose_level = (uint16_t)_verbose;
	if (opt.resv_port_cnt != NO_VAL)
		job->ctx_params.resv_port_cnt = (uint16_t)opt.resv_port_cnt;
	else {
#if defined(HAVE_NATIVE_CRAY)
		/*
		 * On Cray systems default to reserving one port, or one
		 * more than the number of multi prog commands, for Cray PMI
		 */
		job->ctx_params.resv_port_cnt = (opt.multi_prog ?
						 opt.multi_prog_cmds + 1 : 1);
#endif
	}

	switch (opt.distribution & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_CYCLIC_CYCLIC:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_BLOCK_CYCLIC:
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_CFULL:
	case SLURM_DIST_BLOCK_CFULL:
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	case SLURM_DIST_PLANE:
		job->ctx_params.task_dist = SLURM_DIST_PLANE;
		job->ctx_params.plane_size = opt.plane_size;
		break;
	default:
		base_dist = (job->ctx_params.task_count <=
			     job->ctx_params.min_nodes) ?
			    SLURM_DIST_CYCLIC : SLURM_DIST_BLOCK;
		opt.distribution &= SLURM_DIST_STATE_FLAGS;
		opt.distribution |= base_dist;
		job->ctx_params.task_dist = opt.distribution;
		if (opt.ntasks_per_node != NO_VAL)
			job->ctx_params.plane_size = opt.ntasks_per_node;
		break;
	}
	job->ctx_params.overcommit = opt.overcommit ? 1 : 0;

	job->ctx_params.node_list = opt.nodelist;

	job->ctx_params.network = opt.network;
	job->ctx_params.no_kill = opt.no_kill;
	if (opt.job_name_set_cmd && opt.job_name)
		job->ctx_params.name = opt.job_name;
	else
		job->ctx_params.name = opt.cmd_name;
	job->ctx_params.features = opt.constraints;

	debug("requesting job %u, user %u, nodes %u including (%s)",
	      job->ctx_params.job_id, job->ctx_params.uid,
	      job->ctx_params.min_nodes, job->ctx_params.node_list);
	debug("cpus %u, tasks %u, name %s, relative %u",
	      job->ctx_params.cpu_count, job->ctx_params.task_count,
	      job->ctx_params.name, job->ctx_params.relative);
	begin_time = time(NULL);

	for (i = 0; (!(*destroy_job)); i++) {
		bool blocking_step_create = true;

		if (opt.no_alloc) {
			job->step_ctx = slurm_step_ctx_create_no_alloc(
				&job->ctx_params, job->stepid);
		} else if (opt.immediate) {
			job->step_ctx = slurm_step_ctx_create(
				&job->ctx_params);
		} else {
			/* Wait 60 to 70 seconds for response */
			step_wait = (getpid() % 10) * 1000 + 60000;
			job->step_ctx = slurm_step_ctx_create_timeout(
				&job->ctx_params, step_wait);
		}
		if (job->step_ctx != NULL) {
			if (i > 0)
				info("Job step created");
			break;
		}
		rc = slurm_get_errno();

		if (((opt.immediate != 0) &&
		     ((opt.immediate == 1) ||
		      (difftime(time(NULL), begin_time) > opt.immediate))) ||
		    ((rc != ESLURM_NODES_BUSY) && (rc != ESLURM_PORTS_BUSY) &&
		     (rc != ESLURM_PROLOG_RUNNING) &&
		     (rc != SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT) &&
		     (rc != ESLURM_INTERCONNECT_BUSY) &&
		     (rc != ESLURM_DISABLED))) {
			error("Unable to create job step: %m");
			return SLURM_ERROR;
		}
		if (rc == ESLURM_DISABLED)	/* job suspended */
			blocking_step_create = false;

		if (i == 0) {
			if (rc == ESLURM_PROLOG_RUNNING) {
				verbose("Resources allocated for job %u and "
					"being configured, please wait",
					job->ctx_params.job_id);
			} else {
				info("Job step creation temporarily disabled, "
				     "retrying");
			}
			xsignal_unblock(sig_array);
			for (i = 0; sig_array[i]; i++)
				xsignal(sig_array[i], signal_function);

			if (!blocking_step_create)
				my_sleep = (getpid() % 1000) * 100 + 100000;
		} else {
			verbose("Job step creation still disabled, retrying");
			if (!blocking_step_create)
				my_sleep *= 2;
		}
		if (!blocking_step_create) {
			/* sleep 0.1 to 29 secs with exponential back-off */
			my_sleep = MIN(my_sleep, 29000000);
			usleep(my_sleep);
		}
		if (*destroy_job) {
			/* cancelled by signal */
			break;
		}
	}
	if (i > 0) {
		xsignal_block(sig_array);
		if (*destroy_job) {
			info("Cancelled pending job step");
			return SLURM_ERROR;
		}
	}

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_STEPID, &job->stepid);
	/* Number of hosts in job may not have been initialized yet if
	 * --jobid was used or only SLURM_JOB_ID was set in user env.
	 * Reset the value here just in case. */
	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_NUM_HOSTS,
			   &job->nhosts);

	/*
	 * Recreate filenames which may depend upon step id
	 */
	job_update_io_fnames(job);

	return SLURM_SUCCESS;
}
int main(int argc, char *argv[])
{
	int i, min_nodes = 1, max_nodes = 1, nodes, tasks = 0, rc = 0;
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *job_resp;
	slurm_step_ctx_params_t step_params[1];
	slurm_step_ctx_t *ctx = NULL;
	slurm_step_launch_params_t launch[1];
	char *task_argv[3];
	int *fd_array = NULL;
	int num_fd;

	if (argc > 1) {
		i = atoi(argv[1]);
		if (i > 0)
			min_nodes = i;
	}
	if (argc > 2) {
		i = atoi(argv[2]);
		if (i > 0)
			max_nodes = i;
	}
	if (max_nodes < min_nodes)
		max_nodes = min_nodes;

	/* Create a job allocation */
	slurm_init_job_desc_msg(&job_req);
	job_req.min_nodes  = min_nodes;
	job_req.max_nodes  = max_nodes;
	job_req.user_id    = getuid();
	job_req.group_id   = getgid();
	job_req.time_limit = 1;
	if (slurm_allocate_resources(&job_req, &job_resp)) {
		slurm_perror("slurm_allocate_resources");
		printf("INFO: min_nodes=%u max_nodes=%u user_id=%u group_id=%u\n",
		       job_req.min_nodes, job_req.max_nodes,
		       job_req.user_id, job_req.group_id);
		exit(0);
	}
	printf("job_id %u\n", job_resp->job_id);
	fflush(stdout);

	/* Wait for allocation request to be satisfied */
	if ((job_resp->node_list == NULL) ||
	    (strlen(job_resp->node_list) == 0)) {
		printf("Waiting for resource allocation\n");
		fflush(stdout);
		while ((job_resp->node_list == NULL) ||
		       (strlen(job_resp->node_list) == 0)) {
			sleep(5);
			if (slurm_allocation_lookup_lite(job_resp->job_id,
							 &job_resp) &&
			    (slurm_get_errno() != ESLURM_JOB_PENDING)) {
				slurm_perror("slurm_allocation_lookup_lite");
				exit(0);
			}
		}
	}
	nodes = job_resp->node_cnt;
	if (argc > 3)
		tasks = atoi(argv[3]);
	if (tasks < 1)
		tasks = nodes * TASKS_PER_NODE;
	if (tasks < nodes) {
		fprintf(stderr, "Invalid task count argument\n");
		exit(1);
	}
	printf("Starting %d tasks on %d nodes\n", tasks, nodes);
	fflush(stdout);

	/*
	 * Create a job step context.
	 */
	slurm_step_ctx_params_t_init(step_params);
	step_params->job_id = job_resp->job_id;
	step_params->min_nodes = nodes;
	step_params->task_count = tasks;

	ctx = slurm_step_ctx_create(step_params);
	if ((ctx == NULL) &&
	    (slurm_get_errno() == ESLURM_PROLOG_RUNNING)) {
		printf("SlurmctldProlog is still running, "
		       "sleep and try again\n");
		sleep(10);
		ctx = slurm_step_ctx_create(step_params);
	}
	if (ctx == NULL) {
		slurm_perror("slurm_step_ctx_create");
		rc = 1;
		goto done;
	}

	/*
	 * Hack to run one task per node, regardless of what we set up
	 * when we created the job step context.
	 */
	if (slurm_step_ctx_daemon_per_node_hack(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_ctx_daemon_per_node_hack");
		rc = 1;
		goto done;
	}

	/*
	 * Launch the tasks using "user managed" IO.
	 * "user managed" IO means a TCP stream for each task, directly
	 * connected to the stdin, stdout, and stderr of the task.
	 */
	slurm_step_launch_params_t_init(launch);
	task_argv[0] = "./test7.3.io";
	launch->argv = task_argv;
	launch->argc = 1;
	/* This is the key to using "user managed" IO */
	launch->user_managed_io = true;

	if (slurm_step_launch(ctx, launch, NULL) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch");
		rc = 1;
		goto done;
	}

	if (slurm_step_launch_wait_start(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch_wait_start");
		rc = 1;
		goto done;
	}

	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
			   &num_fd, &fd_array);

	/* Interact with launched tasks as desired */
	_do_task_work(fd_array, tasks);

	for (i = 0; i < tasks; i++) {
		close(fd_array[i]);
	}

	slurm_step_launch_wait_finish(ctx);

	/* Terminate the job killing all tasks */
done:	slurm_kill_job(job_resp->job_id, SIGKILL, 0);

	/* clean up storage */
	slurm_free_resource_allocation_response_msg(job_resp);
	if (ctx)
		slurm_step_ctx_destroy(ctx);
	exit(0);
}
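/*
 * The sketch below is NOT part of the original test program: the real
 * _do_task_work() is defined (with a prototype) elsewhere in test7.3.prog.c
 * and is therefore left out of compilation here with "#if 0". It is included
 * only to illustrate the "user managed" IO model used above: each entry in
 * fd_array is a TCP socket connected directly to one task's stdin, stdout,
 * and stderr, so ordinary read()/write() calls are enough to interact with
 * the tasks. The message text and buffer size are illustrative assumptions,
 * not the test's actual protocol; the code relies on headers the test
 * already includes (<stdio.h>, <unistd.h>).
 */
#if 0	/* illustrative sketch only */
static void _do_task_work(int *fd_array, int tasks)
{
	int i;
	char buf[1024];
	ssize_t n;

	for (i = 0; i < tasks; i++) {
		/* Write a line to the task's stdin */
		const char msg[] = "hello task\n";
		if (write(fd_array[i], msg, sizeof(msg) - 1) < 0) {
			perror("write");
			continue;
		}
		/* Read whatever the task sends back on stdout/stderr */
		n = read(fd_array[i], buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("task %d: %s", i, buf);
		}
	}
}
#endif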