コード例 #1
0
ファイル: srun_comm.c プロジェクト: chrisdukey/slurm
/*
 * srun_node_fail - notify srun of a node's failure
 * IN job_ptr - job to notify
 * IN node_name - name of failed node
 */
extern void srun_node_fail(struct job_record *job_ptr, char *node_name)
{
#ifndef HAVE_FRONT_END
	struct node_record *node_ptr;
#endif
	int bit_position = -1;
	slurm_addr_t * addr;
	srun_node_fail_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);
	xassert(node_name);
	if (!job_ptr || !IS_JOB_RUNNING(job_ptr))
		return;

#ifdef HAVE_FRONT_END
	/* Purge all jobs steps in front end mode */
#else
	if (!node_name || (node_ptr = find_node_record(node_name)) == NULL)
		return;
	bit_position = node_ptr - node_record_table_ptr;
#endif

	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		if (step_ptr->step_node_bitmap == NULL)   /* pending step */
			continue;
		if ((bit_position >= 0) &&
		    (!bit_test(step_ptr->step_node_bitmap, bit_position)))
			continue;	/* job step not on this node */
		if ( (step_ptr->port    == 0)    ||
		     (step_ptr->host    == NULL) ||
		     (step_ptr->batch_step)      ||
		     (step_ptr->host[0] == '\0') )
			continue;
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = step_ptr->step_id;
		msg_arg->nodelist = xstrdup(node_name);
		_srun_agent_launch(addr, step_ptr->host, SRUN_NODE_FAIL,
				   msg_arg, step_ptr->start_protocol_ver);
	}
	list_iterator_destroy(step_iterator);

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_node_fail_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = NO_VAL;
		msg_arg->nodelist = xstrdup(node_name);
		_srun_agent_launch(addr, job_ptr->alloc_node, SRUN_NODE_FAIL,
				   msg_arg, job_ptr->start_protocol_ver);
	}
}
コード例 #2
0
ファイル: srun_comm.c プロジェクト: chrisdukey/slurm
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_ptr - job allocated resources
 */
extern void srun_allocate(struct job_record *job_ptr)
{
	struct job_record *pack_job, *pack_leader;
	resource_allocation_response_msg_t *msg_arg = NULL;
	slurm_addr_t *addr;
	ListIterator iter;
	List job_resp_list = NULL;

	xassert(job_ptr);
	if (!job_ptr || !job_ptr->alloc_resp_port || !job_ptr->alloc_node ||
	    !job_ptr->resp_host || !job_ptr->job_resrcs ||
	    !job_ptr->job_resrcs->cpu_array_cnt)
		return;

	if (job_ptr->pack_job_id == 0) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->alloc_resp_port,
			job_ptr->resp_host);

		msg_arg = build_alloc_msg(job_ptr, SLURM_SUCCESS, NULL);
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_RESOURCE_ALLOCATION, msg_arg,
				   job_ptr->start_protocol_ver);
	} else if (_pending_pack_jobs(job_ptr)) {
		return;
	} else if ((pack_leader = find_job_record(job_ptr->pack_job_id))) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, pack_leader->alloc_resp_port,
			       pack_leader->resp_host);
		job_resp_list = list_create(_free_srun_alloc);
		iter = list_iterator_create(pack_leader->pack_job_list);
		while ((pack_job = (struct job_record *) list_next(iter))) {
			if (pack_leader->pack_job_id != pack_job->pack_job_id) {
				error("%s: Bad pack_job_list for %pJ",
				      __func__, pack_leader);
				continue;
			}
			msg_arg = build_alloc_msg(pack_job, SLURM_SUCCESS,
						  NULL);
			list_append(job_resp_list, msg_arg);
			msg_arg = NULL;
		}
		list_iterator_destroy(iter);
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_JOB_PACK_ALLOCATION, job_resp_list,
				   job_ptr->start_protocol_ver);
	} else {
		error("%s: Can not find pack job leader %pJ",
		      __func__, job_ptr);
	}
}
コード例 #3
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_exec - request that srun execute a specific command
 *	and route it's output to stdout
 * IN step_ptr - pointer to the slurmctld job step record
 * IN argv - command and arguments to execute
 */
extern void srun_exec(struct step_record *step_ptr, char **argv)
{
	slurm_addr_t * addr;
	srun_exec_msg_t *msg_arg;
	int cnt = 1, i;

	xassert(step_ptr);

	if (step_ptr->port && step_ptr->host && step_ptr->host[0]) {
		for (i=0; argv[i]; i++)
			cnt++;	/* start at 1 to include trailing NULL */
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_exec_msg_t));
		msg_arg->job_id  = step_ptr->job_ptr->job_id;
		msg_arg->step_id = step_ptr->step_id;
		msg_arg->argc    = cnt;
		msg_arg->argv    = xmalloc(sizeof(char *) * cnt);
		for (i=0; i<cnt ; i++)
			msg_arg->argv[i] = xstrdup(argv[i]);
		_srun_agent_launch(addr, step_ptr->host, SRUN_EXEC,
				   msg_arg);
	} else {
		error("srun_exec %u.%u lacks communication channel",
			step_ptr->job_ptr->job_id, step_ptr->step_id);
	}
}
コード例 #4
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_job_complete - notify srun of a job's termination
 * IN job_ptr - pointer to the slurmctld job record
 */
extern void srun_job_complete (struct job_record *job_ptr)
{
	slurm_addr_t * addr;
	srun_job_complete_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_job_complete_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = NO_VAL;
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   SRUN_JOB_COMPLETE, msg_arg);
	}

	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		if (step_ptr->batch_step)	/* batch script itself */
			continue;
		srun_step_complete(step_ptr);
	}
	list_iterator_destroy(step_iterator);
}
コード例 #5
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/* srun_ping - ping all srun commands that have not been heard from recently */
extern void srun_ping (void)
{
	ListIterator job_iterator;
	struct job_record *job_ptr;
	slurm_addr_t * addr;
	time_t now = time(NULL);
	time_t old = now - (slurmctld_conf.inactive_limit / 3) +
			   slurmctld_conf.msg_timeout + 1;
	srun_ping_msg_t *msg_arg;

	if (slurmctld_conf.inactive_limit == 0)
		return;		/* No limit, don't bother pinging */

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		xassert (job_ptr->magic == JOB_MAGIC);

		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		if ((job_ptr->time_last_active <= old) && job_ptr->other_port
		    &&  job_ptr->alloc_node && job_ptr->resp_host) {
			addr = xmalloc(sizeof(struct sockaddr_in));
			slurm_set_addr(addr, job_ptr->other_port,
				job_ptr->resp_host);
			msg_arg = xmalloc(sizeof(srun_ping_msg_t));
			msg_arg->job_id  = job_ptr->job_id;
			msg_arg->step_id = NO_VAL;
			_srun_agent_launch(addr, job_ptr->alloc_node,
					   SRUN_PING, msg_arg);
		}
	}

	list_iterator_destroy(job_iterator);
}
コード例 #6
0
ファイル: srun_comm.c プロジェクト: chrisdukey/slurm
/*
 * srun_timeout - notify srun of a job's imminent timeout
 * IN job_ptr - pointer to the slurmctld job record
 */
extern void srun_timeout (struct job_record *job_ptr)
{
	slurm_addr_t * addr;
	srun_timeout_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);
	if (!IS_JOB_RUNNING(job_ptr))
		return;

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_timeout_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = NO_VAL;
		msg_arg->timeout  = job_ptr->end_time;
		_srun_agent_launch(addr, job_ptr->alloc_node, SRUN_TIMEOUT,
				   msg_arg, job_ptr->start_protocol_ver);
	}


	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator)))
		srun_step_timeout(step_ptr, job_ptr->end_time);
	list_iterator_destroy(step_iterator);
}
コード例 #7
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_timeout - notify srun of a job's imminent timeout
 * IN job_ptr - pointer to the slurmctld job record
 */
extern void srun_timeout (struct job_record *job_ptr)
{
	slurm_addr_t * addr;
	srun_timeout_msg_t *msg_arg;
	ListIterator step_iterator;
	struct step_record *step_ptr;

	xassert(job_ptr);
	if (!IS_JOB_RUNNING(job_ptr))
		return;

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_timeout_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = NO_VAL;
		msg_arg->timeout  = job_ptr->end_time;
		_srun_agent_launch(addr, job_ptr->alloc_node, SRUN_TIMEOUT,
				   msg_arg);
	}


	step_iterator = list_iterator_create(job_ptr->step_list);
	while ((step_ptr = (struct step_record *) list_next(step_iterator))) {
		if ( (step_ptr->port    == 0)    ||
		     (step_ptr->host    == NULL) ||
		     (step_ptr->batch_step)      ||
		     (step_ptr->host[0] == '\0') )
			continue;
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_timeout_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = step_ptr->step_id;
		msg_arg->timeout  = job_ptr->end_time;
		_srun_agent_launch(addr, step_ptr->host, SRUN_TIMEOUT,
				   msg_arg);
	}
	list_iterator_destroy(step_iterator);
}
コード例 #8
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_user_message - Send arbitrary message to an srun job (no job steps)
 */
extern int srun_user_message(struct job_record *job_ptr, char *msg)
{
	slurm_addr_t * addr;
	srun_user_msg_t *msg_arg;

	xassert(job_ptr);
	if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr))
		return ESLURM_ALREADY_DONE;

	if (job_ptr->other_port &&
	    job_ptr->resp_host && job_ptr->resp_host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_user_msg_t));
		msg_arg->job_id = job_ptr->job_id;
		msg_arg->msg    = xstrdup(msg);
		_srun_agent_launch(addr, job_ptr->resp_host, SRUN_USER_MSG,
				   msg_arg);
		return SLURM_SUCCESS;
	} else if (job_ptr->batch_flag && IS_JOB_RUNNING(job_ptr)) {
#ifndef HAVE_FRONT_END
		struct node_record *node_ptr;
#endif
		job_notify_msg_t *notify_msg_ptr;
		agent_arg_t *agent_arg_ptr;
#ifdef HAVE_FRONT_END
		if (job_ptr->batch_host == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(job_ptr->batch_host);
#else
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		if (node_ptr == NULL)
			return ESLURM_DISABLED;	/* no allocated nodes */
		agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
		agent_arg_ptr->hostlist = hostlist_create(node_ptr->name);
#endif
		if (agent_arg_ptr->hostlist == NULL)
			fatal("hostlist_create: malloc failure");
		notify_msg_ptr = (job_notify_msg_t *) 
				 xmalloc(sizeof(job_notify_msg_t));
		notify_msg_ptr->job_id = job_ptr->job_id;
		notify_msg_ptr->message = xstrdup(msg);
		agent_arg_ptr->node_count = 1;
		agent_arg_ptr->retry = 0;
		agent_arg_ptr->msg_type = REQUEST_JOB_NOTIFY;
		agent_arg_ptr->msg_args = (void *) notify_msg_ptr;
		/* Launch the RPC via agent */
		agent_queue_request(agent_arg_ptr);
		return SLURM_SUCCESS;
	}
	return ESLURM_DISABLED;
}
コード例 #9
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_step_complete - notify srun of a job step's termination
 * IN step_ptr - pointer to the slurmctld job step record
 */
extern void srun_step_complete (struct step_record *step_ptr)
{
	slurm_addr_t * addr;
	srun_job_complete_msg_t *msg_arg;

	xassert(step_ptr);
	if (step_ptr->port && step_ptr->host && step_ptr->host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_job_complete_msg_t));
		msg_arg->job_id   = step_ptr->job_ptr->job_id;
		msg_arg->step_id  = step_ptr->step_id;
		_srun_agent_launch(addr, step_ptr->host, SRUN_JOB_COMPLETE,
				   msg_arg);
	}
}
コード例 #10
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_step_signal - notify srun that a job step should be signalled
 * NOTE: Needed on BlueGene/Q to signal runjob process
 * IN step_ptr  - pointer to the slurmctld job step record
 * IN signal - signal number
 */
extern void srun_step_signal (struct step_record *step_ptr, uint16_t signal)
{
	slurm_addr_t * addr;
	job_step_kill_msg_t *msg_arg;

	xassert(step_ptr);
	if (step_ptr->port && step_ptr->host && step_ptr->host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(job_step_kill_msg_t));
		msg_arg->job_id      = step_ptr->job_ptr->job_id;
		msg_arg->job_step_id = step_ptr->step_id;
		msg_arg->signal      = signal;
		_srun_agent_launch(addr, step_ptr->host, SRUN_STEP_SIGNAL,
				   msg_arg);
	}
}
コード例 #11
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_allocate_abort - notify srun of a resource allocation failure
 * IN job_id - id of the job allocated resource
 */
extern void srun_allocate_abort(struct job_record *job_ptr)
{
	if (job_ptr && job_ptr->alloc_resp_port && job_ptr->alloc_node
	&&  job_ptr->resp_host) {
		slurm_addr_t * addr;
		srun_job_complete_msg_t *msg_arg;
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->alloc_resp_port,
			       job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(srun_timeout_msg_t));
		msg_arg->job_id   = job_ptr->job_id;
		msg_arg->step_id  = NO_VAL;
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   SRUN_JOB_COMPLETE,
				   msg_arg);
	}
}
コード例 #12
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_step_missing - notify srun that a job step is missing from
 *		       a node we expect to find it on
 * IN step_ptr  - pointer to the slurmctld job step record
 * IN node_list - name of nodes we did not find the step on
 */
extern void srun_step_missing (struct step_record *step_ptr,
			       char *node_list)
{
	slurm_addr_t * addr;
	srun_step_missing_msg_t *msg_arg;

	xassert(step_ptr);
	if (step_ptr->port && step_ptr->host && step_ptr->host[0]) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, step_ptr->port, step_ptr->host);
		msg_arg = xmalloc(sizeof(srun_step_missing_msg_t));
		msg_arg->job_id   = step_ptr->job_ptr->job_id;
		msg_arg->step_id  = step_ptr->step_id;
		msg_arg->nodelist = xstrdup(node_list);
		_srun_agent_launch(addr, step_ptr->host, SRUN_STEP_MISSING,
				   msg_arg);
	}
}
コード例 #13
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_allocate - notify srun of a resource allocation
 * IN job_id - id of the job allocated resource
 */
extern void srun_allocate (uint32_t job_id)
{
	struct job_record *job_ptr = find_job_record (job_id);

	xassert(job_ptr);
	if (job_ptr && job_ptr->alloc_resp_port && job_ptr->alloc_node &&
	    job_ptr->resp_host && job_ptr->job_resrcs &&
	    job_ptr->job_resrcs->cpu_array_cnt) {
		slurm_addr_t * addr;
		resource_allocation_response_msg_t *msg_arg;
		job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;

		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->alloc_resp_port,
			job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(resource_allocation_response_msg_t));
		msg_arg->job_id 	= job_ptr->job_id;
		msg_arg->node_list	= xstrdup(job_ptr->nodes);
		msg_arg->alias_list	= xstrdup(job_ptr->alias_list);
		msg_arg->num_cpu_groups	= job_resrcs_ptr->cpu_array_cnt;
		msg_arg->cpus_per_node  = xmalloc(sizeof(uint16_t) *
					  job_resrcs_ptr->cpu_array_cnt);
		if (job_ptr->details) {
			msg_arg->pn_min_memory = job_ptr->details->
						 pn_min_memory;
		}
		memcpy(msg_arg->cpus_per_node,
		       job_resrcs_ptr->cpu_array_value,
		       (sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt));
		msg_arg->cpu_count_reps  = xmalloc(sizeof(uint32_t) *
					   job_resrcs_ptr->cpu_array_cnt);
		memcpy(msg_arg->cpu_count_reps,
		       job_resrcs_ptr->cpu_array_reps,
		       (sizeof(uint32_t) * job_resrcs_ptr->cpu_array_cnt));
		msg_arg->node_cnt	= job_ptr->node_cnt;
		msg_arg->select_jobinfo = select_g_select_jobinfo_copy(
				job_ptr->select_jobinfo);
		msg_arg->error_code	= SLURM_SUCCESS;
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   RESPONSE_RESOURCE_ALLOCATION, msg_arg);
	}
}
コード例 #14
0
ファイル: srun_comm.c プロジェクト: Xarthisius/slurm
/*
 * srun_job_suspend - notify salloc of suspend/resume operation
 * IN job_ptr - pointer to the slurmctld job record
 * IN op - SUSPEND_JOB or RESUME_JOB (enum suspend_opts from slurm.h)
 * RET - true if message send, otherwise false
 */
extern bool srun_job_suspend (struct job_record *job_ptr, uint16_t op)
{
	slurm_addr_t * addr;
	suspend_msg_t *msg_arg;
	bool msg_sent = false;

	xassert(job_ptr);

	if (job_ptr->other_port && job_ptr->alloc_node && job_ptr->resp_host) {
		addr = xmalloc(sizeof(struct sockaddr_in));
		slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host);
		msg_arg = xmalloc(sizeof(suspend_msg_t));
		msg_arg->job_id  = job_ptr->job_id;
		msg_arg->op     = op;
		_srun_agent_launch(addr, job_ptr->alloc_node,
				   SRUN_REQUEST_SUSPEND, msg_arg);
		msg_sent = true;
	}
	return msg_sent;
}
コード例 #15
0
ファイル: srun_comm.c プロジェクト: chrisdukey/slurm
/*
 * srun_step_timeout - notify srun of a job step's imminent timeout
 * IN step_ptr - pointer to the slurmctld step record
 * IN timeout_val - when it is going to time out
 */
extern void srun_step_timeout(struct step_record *step_ptr, time_t timeout_val)
{
	slurm_addr_t *addr;
	srun_timeout_msg_t *msg_arg;

	xassert(step_ptr);

	if (step_ptr->batch_step || !step_ptr->port
	    || !step_ptr->host || (step_ptr->host[0] == '\0'))
		return;

	addr = xmalloc(sizeof(struct sockaddr_in));
	slurm_set_addr(addr, step_ptr->port, step_ptr->host);
	msg_arg = xmalloc(sizeof(srun_timeout_msg_t));
	msg_arg->job_id   = step_ptr->job_ptr->job_id;
	msg_arg->step_id  = step_ptr->step_id;
	msg_arg->timeout  = timeout_val;
	_srun_agent_launch(addr, step_ptr->host, SRUN_TIMEOUT, msg_arg,
			   step_ptr->start_protocol_ver);
}