Esempio n. 1
0
/*
 * srun_user_message - Send arbitrary message to an srun job (no job steps)
 */
extern int srun_user_message(struct job_record *job_ptr, char *msg)
{
	slurm_addr_t * addr;
	srun_user_msg_t *msg_arg;

	xassert(job_ptr);
	if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (job_ptr->other_port &&
	    job_ptr->resp_host && job_ptr->resp_host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_user_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->msg    = xstrdup(msg);
		_srun_agent_launch(addr, job_ptr->resp_host, SRUN_USER_MSG,
				   msg_arg);
		return SLURM_SUCCESS;
	} else if (job_ptr->batch_flag && IS_JOB_RUNNING(job_ptr)) {
#ifndef HAVE_FRONT_END
		struct node_record *node_ptr;
#endif
		job_notify_msg_t *notify_msg_ptr;
		agent_arg_t *agent_arg_ptr;
#ifdef HAVE_FRONT_END
		if (job_ptr->batch_host == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(job_ptr->batch_host);
#else
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		if (node_ptr == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(node_ptr->name);
#endif
		if (agent_arg_ptr->hostlist == NULL)
			fatal("hostlist_create: malloc failure");
		notify_msg_ptr = (job_notify_msg_t *) 
				 xmalloc(sizeof(job_notify_msg_t));
		notify_msg_ptr->job_id = job_ptr->job_id;
		notify_msg_ptr->message = xstrdup(msg);
		agent_arg_ptr->node_count = 1;
		agent_arg_ptr->retry = 0;
		agent_arg_ptr->msg_type = REQUEST_JOB_NOTIFY;
		agent_arg_ptr->msg_args = (void *) notify_msg_ptr;
		/* Launch the RPC via agent */
		agent_queue_request(agent_arg_ptr);
		return SLURM_SUCCESS;
	}
	return ESLURM_DISABLED;
}
Esempio n. 2
0
/*
 * The remainder of this file implements the standard SLURM checkpoint API.
 */
extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
			  struct step_record *step_ptr, uint16_t op,
			  uint16_t data, char *image_dir, time_t * event_time,
			  uint32_t *error_code, char **error_msg )
{
	int rc = SLURM_SUCCESS;
	struct check_job_info *check_ptr;
	uint16_t done_sig = 0;
	struct job_record *job_ptr;
	struct node_record *node_ptr;
	pthread_attr_t attr;
	pthread_t ckpt_agent_tid = 0;
	char *nodelist;
	struct ckpt_req *req_ptr;

	/* job/step checked already */
	job_ptr = find_job_record(job_id);
	if (!job_ptr)
		return ESLURM_INVALID_JOB_ID;
	if (step_id == SLURM_BATCH_SCRIPT) {
		check_ptr = (struct check_job_info *)job_ptr->check_job;
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		nodelist = node_ptr->name;
	} else {
		step_ptr = find_step_record(job_ptr, step_id);
		if (!step_ptr)
			return ESLURM_INVALID_JOB_ID;
		check_ptr = (struct check_job_info *)step_ptr->check_job;
		nodelist = step_ptr->step_layout->node_list;
	}
	xassert(check_ptr);

	switch (op) {
	case CHECK_ABLE:
		if (check_ptr->disabled)
			rc = ESLURM_DISABLED;
		else {
			*event_time = check_ptr->time_stamp;
			rc = SLURM_SUCCESS;
		}
		break;
	case CHECK_DISABLE:
		check_ptr->disabled++;
		break;
	case CHECK_ENABLE:
		check_ptr->disabled--;
		break;
	case CHECK_REQUEUE:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* no break */
	case CHECK_VACATE:
		done_sig = SIGTERM;
		/* no break */
	case CHECK_CREATE:
		if (check_ptr->disabled) {
			rc = ESLURM_DISABLED;
			break;
		}
		if (check_ptr->time_stamp != 0) {
			rc = EALREADY;
			break;
		}

		check_ptr->time_stamp = time(NULL);
		check_ptr->error_code = 0;
		xfree(check_ptr->error_msg);

		req_ptr = xmalloc(sizeof(struct ckpt_req));
		if (!req_ptr) {
			rc = ENOMEM;
			break;
		}
		req_ptr->gid = job_ptr->group_id;
		req_ptr->uid = job_ptr->user_id;
		req_ptr->job_id = job_id;
		req_ptr->step_id = step_id;
		req_ptr->begin_time = check_ptr->time_stamp;
		req_ptr->wait = data;
		req_ptr->image_dir = xstrdup(image_dir);
		req_ptr->nodelist = xstrdup(nodelist);
		req_ptr->sig_done = done_sig;
		req_ptr->op = op;

		slurm_attr_init(&attr);
		if (pthread_attr_setdetachstate(&attr,
						PTHREAD_CREATE_DETACHED)) {
			error("pthread_attr_setdetachstate: %m");
			rc = errno;
			break;
		}

		if (pthread_create(&ckpt_agent_tid, &attr, _ckpt_agent_thr,
				   req_ptr)) {
			error("pthread_create: %m");
			rc = errno;
			break;
		}
		slurm_attr_destroy(&attr);

		break;

	case CHECK_RESTART:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* create a batch job from saved desc */
		rc = ESLURM_NOT_SUPPORTED;
		/* TODO: save job script */
		break;

	case CHECK_ERROR:
		xassert(error_code);
		xassert(error_msg);
		*error_code = check_ptr->error_code;
		xfree(*error_msg);
		*error_msg = xstrdup(check_ptr->error_msg);
		break;
	default:
		error("Invalid checkpoint operation: %d", op);
		rc = EINVAL;
	}

	return rc;
}