Example #1
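A minimal command-line utility: it sends SIGKILL to every Slurm job ID given on the command line via slurm_kill_job(), reporting any failure through slurm_perror().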
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>

int main(int argc, char *argv[])
{
	int error_code = 0, i;

	if (argc < 2) {
		printf ("Usage: %s job_id\n", argv[0]);
		exit (1);
	}

	for (i=1; i<argc; i++) {
		error_code = slurm_kill_job((uint32_t) atoi(argv[i]),
					    SIGKILL, 0);
		if (error_code) {
			char msg[64];
			snprintf(msg, sizeof(msg), "slurm_kill_job(%.12s)", argv[i]);
			slurm_perror (msg);
		}
	}

	return (error_code);
}
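Assuming the Slurm development headers and library are installed, an example like this typically builds with something along the lines of cc -o kill_job kill_job.c -lslurm (the file and program names are placeholders) and is then invoked with one or more job IDs, e.g. ./kill_job 1234 1235.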
Example #2
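A fuller example: it allocates nodes, waits for the allocation to be satisfied, creates a job step context, and launches tasks with "user managed" I/O, which leaves the caller a TCP socket connected to each task. The original snippet assumes a TASKS_PER_NODE constant and a _do_task_work() helper defined elsewhere; the headers, the TASKS_PER_NODE value, and the helper declaration added below are stand-ins (see the sketch after the listing).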
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

#define TASKS_PER_NODE 4	/* assumed value; the original program defines this elsewhere */

static void _do_task_work(int *fd_array, int tasks);	/* sketched after the listing */

int main(int argc, char *argv[])
{
	int i, min_nodes = 1, max_nodes = 1, nodes, tasks = 0, rc = 0;
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *job_resp;
	slurm_step_ctx_params_t step_params[1];
	slurm_step_ctx_t *ctx = NULL;
	slurm_step_launch_params_t launch[1];
	char *task_argv[3];
	int *fd_array = NULL;
	int num_fd;

	if (argc > 1) {
		i = atoi(argv[1]);
		if (i > 0)
			min_nodes = i;
	}
	if (argc > 2) {
		i = atoi(argv[2]);
		if (i > 0)
			max_nodes = i;
	}
	if (max_nodes < min_nodes)
		max_nodes = min_nodes;

	/* Create a job allocation */
	slurm_init_job_desc_msg( &job_req );
	job_req.min_nodes  = min_nodes;
	job_req.max_nodes  = max_nodes;
	job_req.user_id    = getuid();
	job_req.group_id   = getgid();
	job_req.time_limit = 1;
	if (slurm_allocate_resources(&job_req, &job_resp)) {
		slurm_perror ("slurm_allocate_resources");
		printf("INFO: min_nodes=%u max_nodes=%u user_id=%u group_id=%u",
		       job_req.min_nodes, job_req.max_nodes,
		       job_req.user_id, job_req.group_id);
		exit(0);
	}
	printf("job_id %u\n", job_resp->job_id);
	fflush(stdout);

	/* Wait for allocation request to be satisfied */
	if ((job_resp->node_list == NULL) ||
	    (strlen(job_resp->node_list) == 0)) {
		printf("Waiting for resource allocation\n");
		fflush(stdout);
		while ((job_resp->node_list == NULL) ||
		       (strlen(job_resp->node_list) == 0)) {
			sleep(5);
			if (slurm_allocation_lookup_lite(job_resp->job_id,
							 &job_resp) &&
			    (slurm_get_errno() != ESLURM_JOB_PENDING)) {
				slurm_perror("slurm_confirm_allocation");
				exit(0);
			}
		}
	}
	nodes = job_resp->node_cnt;
	if (argc > 3)
		tasks = atoi(argv[3]);
	if (tasks < 1)
		tasks = nodes * TASKS_PER_NODE;
	if (tasks < nodes) {
		fprintf(stderr, "Invalid task count argument\n");
		exit(1);
	}
	printf("Starting %d tasks on %d nodes\n", tasks, nodes);
	fflush(stdout);

	/*
	 * Create a job step context.
	 */
	slurm_step_ctx_params_t_init(step_params);
	step_params->job_id = job_resp->job_id;
	step_params->min_nodes = nodes;
	step_params->task_count = tasks;

	ctx = slurm_step_ctx_create(step_params);
	if ((ctx == NULL) &&
	    (slurm_get_errno() == ESLURM_PROLOG_RUNNING)) {
		printf("SlurmctldProlog is still running, "
		       "sleep and try again\n");
		sleep(10);
		ctx = slurm_step_ctx_create(step_params);
	}
	if (ctx == NULL) {
		slurm_perror("slurm_step_ctx_create");
		rc = 1;
		goto done;
	}

	/*
	 * Hack to run one task per node, regardless of what we set up
	 * when we created the job step context.
	 */
	if (slurm_step_ctx_daemon_per_node_hack(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_ctx_daemon_per_node_hack");
		rc = 1;
		goto done;
	}

	/*
	 * Launch the tasks using "user managed" I/O.
	 * "User managed" I/O means a TCP stream for each task, directly
	 * connected to the stdin, stdout, and stderr of the task.
	 */
	slurm_step_launch_params_t_init(launch);
	task_argv[0] = "./test7.3.io";
	launch->argv = task_argv;
	launch->argc = 1;
	launch->user_managed_io = true; /* This is the key to using
					  "user managed" IO */

	if (slurm_step_launch(ctx, launch, NULL) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch");
		rc = 1;
		goto done;
	}

	if (slurm_step_launch_wait_start(ctx) != SLURM_SUCCESS) {
		slurm_perror("slurm_step_launch_wait_start");
		rc = 1;
		goto done;
	}

	slurm_step_ctx_get(ctx, SLURM_STEP_CTX_USER_MANAGED_SOCKETS,
			   &num_fd, &fd_array);

	/* Interact with launched tasks as desired */
	_do_task_work(fd_array, tasks);

	for (i = 0; i < tasks; i++) {
		close(fd_array[i]);
	}

	slurm_step_launch_wait_finish(ctx);

	/* Terminate the job killing all tasks */
done:	slurm_kill_job(job_resp->job_id, SIGKILL, 0);

	/* clean up storage */
	slurm_free_resource_allocation_response_msg(job_resp);
	if (ctx)
		slurm_step_ctx_destroy(ctx);
	exit(rc);
}
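The listing calls _do_task_work() without showing it. Below is a minimal sketch of what such a helper could look like, assuming the launched program (./test7.3.io above) echoes its stdin back over the per-task socket; the helper body, message text, and buffer size are illustrative, not part of the Slurm API.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: write a short message down each task's socket
 * and read back whatever the task sends in response. With user-managed
 * I/O, fd_array[i] is connected directly to task i's stdin/stdout/stderr. */
static void _do_task_work(int *fd_array, int tasks)
{
	char buf[256];
	ssize_t n;
	int i;

	for (i = 0; i < tasks; i++) {
		const char msg[] = "hello task\n";

		if (write(fd_array[i], msg, strlen(msg)) < 0) {
			perror("write to task");
			continue;
		}
		n = read(fd_array[i], buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("task %d replied: %s", i, buf);
		}
	}
}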
Example #3
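A job-control callback from the FedStage DRMAA library for Slurm: it maps DRMAA control actions (suspend, resume, hold, release, terminate) onto the corresponding Slurm API calls while holding the session's connection mutex.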
static void
slurmdrmaa_job_control( fsd_job_t *self, int action )
{
	slurmdrmaa_job_t *slurm_self = (slurmdrmaa_job_t*)self;
	job_desc_msg_t job_desc;

	fsd_log_enter(( "({job_id=%s}, action=%d)", self->job_id, action ));

	fsd_mutex_lock( &self->session->drm_connection_mutex );
	TRY
	 {
		switch( action )
		 {
			case DRMAA_CONTROL_SUSPEND:
				if (slurm_suspend(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,
							"slurm_suspend error: %s, job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id);
				}
				slurm_self->user_suspended = true;
				break;
			case DRMAA_CONTROL_HOLD:
				/* change priority to 0 */
				slurm_init_job_desc_msg(&job_desc);
				slurm_self->old_priority = job_desc.priority;
				job_desc.job_id = atoi(self->job_id);
				job_desc.priority = 0;
				job_desc.alloc_sid = 0;
				if (slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,
							"slurm_update_job error: %s, job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id);
				}
				break;
			case DRMAA_CONTROL_RESUME:
				if (slurm_resume(fsd_atoi(self->job_id)) == -1) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,
							"slurm_resume error: %s, job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id);
				}
				slurm_self->user_suspended = false;
				break;
			case DRMAA_CONTROL_RELEASE:
				/* change priority back */
				slurm_init_job_desc_msg(&job_desc);
				job_desc.priority = INFINITE;
				job_desc.job_id = atoi(self->job_id);
				if (slurm_update_job(&job_desc) == -1) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,
							"slurm_update_job error: %s, job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id);
				}
				break;
			case DRMAA_CONTROL_TERMINATE:
				if (slurm_kill_job(fsd_atoi(self->job_id), SIGKILL, 0) == -1) {
					fsd_exc_raise_fmt(FSD_ERRNO_INTERNAL_ERROR,
							"slurm_kill_job error: %s, job_id: %s",
							slurm_strerror(slurm_get_errno()),
							self->job_id);
				}
				break;
				break;
			default:
				fsd_exc_raise_fmt(
						FSD_ERRNO_INVALID_ARGUMENT,
						"job::control: unknown action %d", action );
		 }
					
		fsd_log_debug(("job::control: successful"));
	 }
	FINALLY
	 {
		fsd_mutex_unlock( &self->session->drm_connection_mutex );
	 }
	END_TRY

	fsd_log_return(( "" ));
}
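Note the hold/release convention used here: Slurm has no dedicated hold call in this API, so the code holds a job by updating its priority to 0 and releases it by setting the priority to INFINITE, which tells slurmctld to recompute the job's normal priority. Also note that old_priority is captured from a freshly initialized job descriptor, i.e. the default value rather than the job's live priority.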