extern int launch_p_step_wait(srun_job_t *job, bool got_alloc)
{
	int rc = 0;

	slurm_step_launch_wait_finish(job->step_ctx);

	/* If a retry was requested and we are not running under a debugger,
	 * tear down the old step context and build a new job step. */
	if ((MPIR_being_debugged == 0) && retry_step_begin &&
	    (retry_step_cnt < MAX_STEP_RETRIES)) {
		retry_step_begin = false;
		slurm_step_ctx_destroy(job->step_ctx);
		if (got_alloc) {
			if (create_job_step(job, true) < 0)
				exit(error_exit);
		} else {
			if (create_job_step(job, false) < 0)
				exit(error_exit);
		}
		task_state_destroy(task_state);
		rc = -1;	/* Tell the caller to launch the step again */
	}

	return rc;
}
int main(int argc, char *argv[])
{
	int i, min_nodes = 1, max_nodes = 1, nodes, tasks = 0, rc = 0;
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *job_resp;
	slurm_step_ctx_params_t step_params[1];
	slurm_step_ctx_t *ctx = NULL;
	slurm_step_launch_params_t launch[1];
	char *task_argv[3];
	int *fd_array = NULL;
	int num_fd;

	/* Parse optional arguments: min_nodes, max_nodes, task count */
	if (argc > 1) {
		i = atoi(argv[1]);
		if (i > 0)
			min_nodes = i;
	}
	if (argc > 2) {
		i = atoi(argv[2]);
		if (i > 0)
			max_nodes = i;
	}
	if (max_nodes < min_nodes)
		max_nodes = min_nodes;

	/* Create a job allocation */
	slurm_init_job_desc_msg(&job_req);
	job_req.min_nodes  = min_nodes;
	job_req.max_nodes  = max_nodes;
	job_req.user_id    = getuid();
	job_req.group_id   = getgid();
	job_req.time_limit = 1;
	if (slurm_allocate_resources(&job_req, &job_resp)) {
		slurm_perror("slurm_allocate_resources");
		printf("INFO: min_nodes=%u max_nodes=%u user_id=%u group_id=%u\n",
		       job_req.min_nodes, job_req.max_nodes,
		       job_req.user_id, job_req.group_id);
		exit(0);
	}
	printf("job_id %u\n", job_resp->job_id);
	fflush(stdout);

	/* Wait for the allocation request to be satisfied */
	if ((job_resp->node_list == NULL) ||
	    (strlen(job_resp->node_list) == 0)) {
		printf("Waiting for resource allocation\n");
		fflush(stdout);
		while ((job_resp->node_list == NULL) ||
		       (strlen(job_resp->node_list) == 0)) {
			sleep(5);
			if (slurm_allocation_lookup_lite(job_resp->job_id,
							 &job_resp) &&
			    (slurm_get_errno() != ESLURM_JOB_PENDING)) {
				slurm_perror("slurm_allocation_lookup_lite");
				exit(0);
			}
		}
	}

	/* Determine the task count: argument if given, else per-node default */
	nodes = job_resp->node_cnt;
	if (argc > 3)
		tasks = atoi(argv[3]);
	if (tasks < 1)
		tasks = nodes * TASKS_PER_NODE;
	if (tasks < nodes) {
		fprintf(stderr, "Invalid task count argument\n");
		exit(1);
	}
	printf("Starting %d tasks on %d nodes\n", tasks, nodes);
	fflush(stdout);

	/*
	 * Create a job step context.
	 */
	slurm_step_ctx_params_t_init(step_params);
	step_params->job_id = job_resp->job_id;
	step_params->min_nodes = nodes;
	step_params->task_count = tasks;

	ctx = slurm_step_ctx_create(step_params);
	if ((ctx == NULL) && (slurm_get_errno() == ESLURM_PROLOG_RUNNING)) {
		printf("SlurmctldProlog is still running, "
		       "sleep and try again\n");
		sleep(10);
		ctx = slurm_step_ctx_create(step_params);
	}
	if (ctx == NULL) {
		slurm_perror("slurm_step_ctx_create");
		rc = 1;
		goto done;
	}

	/*
	 * Hack to run one task per node, regardless of what we set up
	 * when we created the job step context.
	 */
	if (slurm_step_ctx_daemon_per_node_hack(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_ctx_daemon_per_node_hack");
		rc = 1;
		goto done;
	}

	/*
	 * Launch the tasks using "user managed" IO.
	 * "user managed" IO means a TCP stream for each task, directly
	 * connected to the stdin, stdout, and stderr of the task.
	 */
	slurm_step_launch_params_t_init(launch);
	task_argv[0] = "./test7.3.io";
	launch->argv = task_argv;
	launch->argc = 1;
	launch->user_managed_io = true; /* This is the key to using
					 * "user managed" IO */

	if (slurm_step_launch(ctx, launch, NULL) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch");
		rc = 1;
		goto done;
	}
	if (slurm_step_launch_wait_start(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch_wait_start");
		rc = 1;
		goto done;
	}

	/* Retrieve the per-task sockets for user-managed IO */
	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
			   &num_fd, &fd_array);

	/* Interact with launched tasks as desired */
	_do_task_work(fd_array, tasks);

	for (i = 0; i < tasks; i++) {
		close(fd_array[i]);
	}

	slurm_step_launch_wait_finish(ctx);

	/* Terminate the job, killing all tasks */
done:	slurm_kill_job(job_resp->job_id, SIGKILL, 0);

	/* Clean up storage */
	slurm_free_resource_allocation_response_msg(job_resp);
	if (ctx)
		slurm_step_ctx_destroy(ctx);
	exit(0);
}
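
/*
 * A minimal sketch, for illustration only, of what a helper such as
 * _do_task_work() (defined elsewhere in the original test program, not
 * shown here) might do with the user-managed IO sockets.  It assumes each
 * descriptor in fd_array is a TCP stream connected to one task's
 * stdin/stdout, writes a line to every task, and echoes back whatever the
 * task prints.  The name _do_task_work_sketch and the message text are
 * hypothetical; assumes <stdio.h>, <string.h>, and <unistd.h>.
 */
static void _do_task_work_sketch(int *fd_array, int tasks)
{
	char buf[256];
	ssize_t n;
	int i;

	for (i = 0; i < tasks; i++) {
		const char msg[] = "hello task\n";

		/* Send a line to the task's stdin */
		if (write(fd_array[i], msg, sizeof(msg) - 1) < 0)
			continue;

		/* Read back one chunk of the task's stdout */
		n = read(fd_array[i], buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("task %d: %s", i, buf);
		}
	}
}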