예제 #1
0
extern int launch_p_step_wait(srun_job_t *job, bool got_alloc)
{
	int rc = 0;

	slurm_step_launch_wait_finish(job->step_ctx);
	if ((MPIR_being_debugged == 0) && retry_step_begin &&
	    (retry_step_cnt < MAX_STEP_RETRIES)) {
		retry_step_begin = false;
		slurm_step_ctx_destroy(job->step_ctx);
		if (got_alloc) {
			if (create_job_step(job, true) < 0)
				exit(error_exit);
		} else {
			if (create_job_step(job, false) < 0)
				exit(error_exit);
		}
		task_state_destroy(task_state);
		rc = -1;
	}
	return rc;
}
예제 #2
0
파일: test7.3.prog.c 프로젝트: VURM/slurm
int main (int argc, char *argv[])
{
	int i, min_nodes = 1, max_nodes = 1, nodes, tasks = 0, rc = 0;
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *job_resp;
	slurm_step_ctx_params_t step_params[1];
	slurm_step_ctx_t *ctx = NULL;
	slurm_step_launch_params_t launch[1];
	char *task_argv[3];
	int *fd_array = NULL;
	int num_fd;

	if (argc > 1) {
		i = atoi(argv[1]);
		if (i > 0)
			min_nodes = i;
	}
	if (argc > 2) {
		i = atoi(argv[2]);
		if (i > 0)
			max_nodes = i;
	}
	if (max_nodes < min_nodes)
		max_nodes = min_nodes;

	/* Create a job allocation */
	slurm_init_job_desc_msg( &job_req );
	job_req.min_nodes  = min_nodes;
	job_req.max_nodes  = max_nodes;
	job_req.user_id    = getuid();
	job_req.group_id   = getgid();
	job_req.time_limit = 1;
	if (slurm_allocate_resources(&job_req, &job_resp)) {
		slurm_perror ("slurm_allocate_resources");
		printf("INFO: min_nodes=%u max_nodes=%u user_id=%u group_id=%u",
		       job_req.min_nodes, job_req.max_nodes,
		       job_req.user_id, job_req.group_id);
		exit(0);
	}
	printf("job_id %u\n", job_resp->job_id);
	fflush(stdout);

	/* Wait for allocation request to be satisfied */
	if ((job_resp->node_list == NULL) ||
	    (strlen(job_resp->node_list) == 0)) {
		printf("Waiting for resource allocation\n");
		fflush(stdout);
		while ((job_resp->node_list == NULL) ||
		       (strlen(job_resp->node_list) == 0)) {
			sleep(5);
			if (slurm_allocation_lookup_lite(job_resp->job_id,
							 &job_resp) &&
			    (slurm_get_errno() != ESLURM_JOB_PENDING)) {
				slurm_perror("slurm_confirm_allocation");
				exit(0);
			}
		}
	}
	nodes = job_resp->node_cnt;
	if (argc > 3)
		tasks = atoi(argv[3]);
	if (tasks < 1)
		tasks = nodes * TASKS_PER_NODE;
	if (tasks < nodes) {
		fprintf(stderr, "Invalid task count argument\n");
		exit(1);
	}
	printf("Starting %d tasks on %d nodes\n", tasks, nodes);
	fflush(stdout);

	/*
	 * Create a job step context.
	 */
	slurm_step_ctx_params_t_init(step_params);
	step_params->job_id = job_resp->job_id;
	step_params->min_nodes = nodes;
	step_params->task_count = tasks;

	ctx = slurm_step_ctx_create(step_params);
	if ((ctx == NULL) &&
	    (slurm_get_errno() == ESLURM_PROLOG_RUNNING)) {
		printf("SlurmctldProlog is still running, "
		       "sleep and try again\n");
		sleep(10);
		ctx = slurm_step_ctx_create(step_params);
	}
	if (ctx == NULL) {
		slurm_perror("slurm_step_ctx_create");
		rc = 1;
		goto done;
	}

	/*
	 * Hack to run one task per node, regardless of what we set up
	 * when we created the job step context.
	 */
	if (slurm_step_ctx_daemon_per_node_hack(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_ctx_daemon_per_node_hack");
		rc = 1;
		goto done;
	}

	/*
	 * Launch the tasks using "user managed" IO.
	 * "user managed" IO means a TCP stream for each task, directly
         * connected to the stdin, stdout, and stderr the task.
	 */
	slurm_step_launch_params_t_init(launch);
	task_argv[0] = "./test7.3.io";
	launch->argv = task_argv;
	launch->argc = 1;
	launch->user_managed_io = true; /* This is the key to using
					  "user managed" IO */

	if (slurm_step_launch(ctx, launch, NULL) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch");
		rc = 1;
		goto done;
	}

	if (slurm_step_launch_wait_start(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch_wait_start");
		rc =1;
		goto done;
	}

	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
			   &num_fd, &fd_array);

	/* Interact with launched tasks as desired */
	_do_task_work(fd_array, tasks);

	for (i = 0; i < tasks; i++) {
		close(fd_array[i]);
	}

	slurm_step_launch_wait_finish(ctx);

	/* Terminate the job killing all tasks */
done:	slurm_kill_job(job_resp->job_id, SIGKILL, 0);

	/* clean up storage */
	slurm_free_resource_allocation_response_msg(job_resp);
	if (ctx)
		slurm_step_ctx_destroy(ctx);
	exit(0);
}