Exemplo n.º 1
0
/*
 * Report an exception as a signal.
 *
 * signo: signal number to raise
 * regs:  register state at the time of the exception
 * code:  signal code forwarded to _send_sig()
 * addr:  address forwarded to _send_sig()
 *
 * When the exception was not taken in user mode, die() is invoked first
 * with the register state and signal number; _send_sig() is then called
 * in every case where control reaches it.
 */
void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr)
{
	if (user_mode(regs) == 0) {
		die("Exception in kernel mode", regs, signo);
	}

	_send_sig(signo, code, addr);
}
Exemplo n.º 2
0
/*
 * Signal a job step as part of a checkpoint request.
 *
 * The signal is sent only to the first node set in the step's node bitmap
 * (the process launched on node 0), and a timeout record carrying
 * sig_timeout and wait is queued for that same node.  The remaining nodes
 * are only counted, so check_ptr->node_cnt ends up holding the number of
 * nodes in the step.
 *
 * step_ptr:    step to signal (must have check_job and job_ptr set)
 * wait:        wait time recorded in the step's checkpoint info
 * signal:      signal number to send now
 * sig_timeout: signal number passed to the timeout queue
 *
 * Returns SLURM_SUCCESS, ESLURM_ALREADY_DONE if the job has finished,
 * ESLURM_DISABLED if checkpointing is disabled for the step, or
 * ESLURM_INVALID_NODE_NAME if the step has no nodes.
 */
static int _step_sig(struct step_record * step_ptr, uint16_t wait,
		uint16_t signal, uint16_t sig_timeout)
{
	struct check_job_info *check_ptr;
	struct job_record *job_ptr;
	int node_inx;

	xassert(step_ptr);
	check_ptr = (struct check_job_info *) step_ptr->check_job;
	xassert(check_ptr);
	job_ptr = step_ptr->job_ptr;
	xassert(job_ptr);

	if (IS_JOB_FINISHED(job_ptr)) {
		return ESLURM_ALREADY_DONE;
	}
	if (check_ptr->disabled) {
		return ESLURM_DISABLED;
	}

	/* The node count is rebuilt from the bitmap on every call */
	check_ptr->node_cnt = 0;
	for (node_inx = 0; node_inx < node_record_count; node_inx++) {
		if (bit_test(step_ptr->step_node_bitmap, node_inx) != 0) {
			/* Count every node, but act on the first one only */
			if (check_ptr->node_cnt++ > 0)
				continue;

			_send_sig(job_ptr->job_id, step_ptr->step_id, signal,
				node_record_table_ptr[node_inx].name,
				node_record_table_ptr[node_inx].slurm_addr);
			/* NOTE(review): the timeout is queued with the
			 * time_stamp value from before it is refreshed
			 * below -- confirm this ordering is intended. */
			_ckpt_enqueue_timeout(job_ptr->job_id,
				step_ptr->step_id, check_ptr->time_stamp,
				sig_timeout, wait,
				node_record_table_ptr[node_inx].name,
				node_record_table_ptr[node_inx].slurm_addr);
		}
	}

	if (check_ptr->node_cnt == 0) {
		error("_step_sig: job %u.%u has no nodes", job_ptr->job_id,
			step_ptr->step_id);
		return ESLURM_INVALID_NODE_NAME;
	}

	check_ptr->wait_time  = wait;
	check_ptr->time_stamp = time(NULL);

	info("checkpoint requested for job %u.%u", job_ptr->job_id,
		step_ptr->step_id);
	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
Arquivo: xcpu.c Projeto: IFCA/slurm
/*
 * Identify every XCPU process on each of the given nodes and signal it.
 *
 * sig:   signal number; it is translated to a name with _sig_name()
 *        because the control file takes a string command
 * nodes: host list expression naming the nodes to scan
 *
 * For each node, every entry of "<XCPU_DIR>/<node>/xcpu" is treated as a
 * process directory and the signal request is passed to _send_sig() for
 * its "ctl" file.  The "." and ".." entries returned by readdir() are
 * skipped since they are not process directories, and any path that does
 * not fit its buffer is reported and skipped instead of being used in
 * truncated form.
 *
 * Returns the sum of the _send_sig() results (the process count), or 0 if
 * the host list cannot be created.
 */
extern int xcpu_signal(int sig, char *nodes)
{
	int procs = 0;
	int len;
	hostlist_t hl;
	char *node, sig_msg[64], dir_path[128], ctl_path[200];
	const char *name;
	DIR *dir;
	struct dirent *sub_dir;

	/* Translate "nodes" to a hostlist */
	hl = hostlist_create(nodes);
	if (hl == NULL) {
		error("hostlist_create: %m");
		return 0;
	}

	/* Plan 9 only takes strings, so we map number to name */
	snprintf(sig_msg, sizeof(sig_msg), "signal %s",
		_sig_name(sig));

	/* For each node, look for processes */
	while ((node = hostlist_shift(hl))) {
		len = snprintf(dir_path, sizeof(dir_path),
			"%s/%s/xcpu",
			XCPU_DIR, node);
		if ((len < 0) || (len >= (int) sizeof(dir_path))) {
			/* A truncated path could name the wrong directory */
			error("xcpu_signal: path too long for node %s", node);
			free(node);
			continue;
		}
		free(node);

		if ((dir = opendir(dir_path)) == NULL) {
			error("opendir(%s): %m", dir_path);
			continue;
		}
		while ((sub_dir = readdir(dir))) {
			name = sub_dir->d_name;
			/* "." and ".." are never process directories */
			if ((name[0] == '.') &&
			    ((name[1] == '\0') ||
			     ((name[1] == '.') && (name[2] == '\0'))))
				continue;

			len = snprintf(ctl_path, sizeof(ctl_path),
				"%s/%s/ctl", dir_path, name);
			if ((len < 0) || (len >= (int) sizeof(ctl_path))) {
				error("xcpu_signal: path too long for %s/%s",
					dir_path, name);
				continue;
			}
			procs += _send_sig(ctl_path, sig, sig_msg);
		}
		closedir(dir);
	}

	hostlist_destroy(hl);
	return procs;
}
Exemplo n.º 4
0
/* Checkpoint processing pthread.
 * Handles the single checkpoint request passed in "arg" (a struct
 * ckpt_req *): issues the checkpoint, records the result in the job's or
 * step's checkpoint info, optionally signals the step, then frees the
 * request and returns NULL.
 * NOTE(review): the previous comment stated this thread never returns and
 * is cancelled on plugin termination; the visible code returns after one
 * request -- confirm which description callers rely on. */
static void *_ckpt_agent_thr(void *arg)
{
	struct ckpt_req *req = (struct ckpt_req *)arg;
	int rc;
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	struct check_job_info *check_ptr;

	/* only perform ckpt operation of ONE JOB */
	/* Wait until no job is registered in ckpt_agent_jobid, or the
	 * registered job is this request's job, then register ourselves. */
	slurm_mutex_lock(&ckpt_agent_mutex);
	while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) {
		pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex);
	}
	ckpt_agent_jobid = req->job_id;
	ckpt_agent_count ++;
	slurm_mutex_unlock(&ckpt_agent_mutex);

	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
	       req->op, req->job_id, req->step_id);

	/* A failure is logged but processing continues so that the result
	 * code is still recorded and reported below. */
	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
			      req->image_dir, req->wait, req->nodelist);
	if (rc != SLURM_SUCCESS) {
		error("checkpoint/blcr: error on checkpoint request %u to "
		      "%u.%u: %s", req->op, req->job_id, req->step_id,
		      slurm_strerror(rc));
	}
	if (req->op == CHECK_REQUEUE)
		_requeue_when_finished(req->job_id);

	/* Look up the job (and step) again under the job write lock and
	 * store the outcome in its checkpoint info; if either record is
	 * gone, skip straight to the unlock. */
	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(req->job_id);
	if (!job_ptr) {
		error("_ckpt_agent_thr: job finished");
		goto out;
	}
	if (req->step_id == SLURM_BATCH_SCRIPT) {	/* batch job */
		check_ptr = (struct check_job_info *)job_ptr->check_job;
	} else {
		step_ptr = find_step_record(job_ptr, req->step_id);
		if (! step_ptr) {
			error("_ckpt_agent_thr: step finished");
			goto out;
		}
		check_ptr = (struct check_job_info *)step_ptr->check_job;
	}
	/* presumably a zero time_stamp marks "no checkpoint in progress"
	 * -- TODO confirm against the readers of this field */
	check_ptr->time_stamp = 0;
	check_ptr->error_code = rc;
	/* NOTE(review): any existing error_msg is overwritten here without
	 * being freed -- confirm it cannot already be set. */
	if (check_ptr->error_code != SLURM_SUCCESS)
		check_ptr->error_msg = xstrdup(slurm_strerror(rc));

 out:
	unlock_slurmctld(job_write_lock);

	/* Optional completion signal, sent only when sig_done is non-zero */
	if (req->sig_done) {
		_send_sig(req->job_id, req->step_id, req->sig_done,
			  req->nodelist);
	}

	_on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id,
			  req->image_dir, rc);

	/* Deregister; the last agent clears the registered job id and wakes
	 * every thread waiting on the condition above. */
	slurm_mutex_lock(&ckpt_agent_mutex);
	ckpt_agent_count --;
	if (ckpt_agent_count == 0) {
		ckpt_agent_jobid = 0;
		pthread_cond_broadcast(&ckpt_agent_cond);
	}
	slurm_mutex_unlock(&ckpt_agent_mutex);
	_ckpt_req_free(req);
	return NULL;
}
Exemplo n.º 5
0
/*
 * C-level handler for trap 3: raises SIGILL with code ILL_ILLTRP through
 * _send_sig(), passing fp->ea as the address argument.
 * (presumably fp->ea is the saved exception return address -- TODO confirm
 * against the struct pt_regs definition)
 */
asmlinkage void handle_trap_3_c(struct pt_regs *fp)
{
	_send_sig(SIGILL, ILL_ILLTRP, fp->ea);
}
Exemplo n.º 6
0
/*
 * C-level handler for trap 2: raises SIGUSR2 with a signal code of 0
 * through _send_sig(), passing fp->ea as the address argument.
 * (presumably fp->ea is the saved exception return address -- TODO confirm
 * against the struct pt_regs definition)
 */
asmlinkage void handle_trap_2_c(struct pt_regs *fp)
{
	_send_sig(SIGUSR2, 0, fp->ea);
}
Exemplo n.º 7
0
/*
 * Send the signal stored in a checkpoint timeout record to its job step.
 * The job id, step id, signal number, node name and node address are all
 * taken from "rec" and forwarded unchanged to _send_sig().
 */
static void _ckpt_signal_step(struct ckpt_timeout_info *rec)
{
	/* debug("signal %u.%u %u", rec->job_id, rec->step_id, rec->signal); */
	_send_sig(rec->job_id, rec->step_id, rec->signal,
		rec->node_name, rec->node_addr);
}