Beispiel #1
0
Datei: req.c Projekt: VURM/slurm
static int
_handle_signal_container(int fd, slurmd_job_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	static int msg_sent = 0;

	debug("_handle_signal_container for job %u.%u",
	      job->jobid, job->stepid);

	safe_read(fd, &sig, sizeof(int));

	debug3("  uid = %d", uid);
	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("kill container req from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
			job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	if ((job->nodeid == 0) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];
		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED)  || (sig == SIG_FAILURE))
		goto done;
	if (sig == SIG_DEBUG_WAKE) {
		int i;
		for (i = 0; i < job->node_tasks; i++)
			pdebug_wake_process(job, job->task[i]->pid);
		goto done;
	}
	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	pthread_mutex_lock(&suspend_mutex);
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/*
	 * Signal the container
	 */
	if (slurm_container_signal(job->cont_id, sig) < 0) {
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}
Beispiel #2
0
static int
_handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid)
{
	int rc = SLURM_SUCCESS;
	int errnum = 0;
	int sig;
	static int msg_sent = 0;
	char *ptr = NULL;
	int target_node_id = 0;
	stepd_step_task_info_t *task;
	uint32_t i;

	safe_read(fd, &sig, sizeof(int));
	debug("_handle_signal_container for step=%u.%u uid=%d signal=%d",
	      job->jobid, job->stepid, (int) uid, sig);
	if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
		error("signal container req from uid %ld for step=%u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		rc = -1;
		errnum = EPERM;
		goto done;
	}

	/*
	 * Sanity checks
	 */
	if (job->cont_id == 0) {
		debug ("step %u.%u invalid container [cont_id:%"PRIu64"]",
			job->jobid, job->stepid, job->cont_id);
		rc = -1;
		errnum = ESLURMD_JOB_NOTRUNNING;
		goto done;
	}

	if ((sig == SIGTERM) || (sig == SIGKILL)) {
		/* cycle thru the tasks and mark those that have not
		 * called abort and/or terminated as killed_by_cmd
		 */
		for (i = 0; i < job->node_tasks; i++) {
			if (NULL == (task = job->task[i])) {
				continue;
			}
			if (task->aborted || task->exited) {
				continue;
			}
			/* mark that this task is going to be killed by
			 * cmd so we ignore its exit status - otherwise,
			 * we will probably report the final exit status
			 * as SIGKILL
			 */
			task->killed_by_cmd = true;
		}
	}

	ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
	if (ptr)
		target_node_id = atoi(ptr);
	if ((job->nodeid == target_node_id) && (msg_sent == 0) &&
	    (job->state < SLURMSTEPD_STEP_ENDING)) {
		time_t now = time(NULL);
		char entity[24], time_str[24];
		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		/* Not really errors,
		 * but we want messages displayed by default */
		if (sig == SIG_TIME_LIMIT) {
			error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_PREEMPTED) {
			error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_NODE_FAIL) {
			error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
			      entity, time_str);
			msg_sent = 1;
		} else if (sig == SIG_FAILURE) {
			error("*** %s FAILED (non-zero exit code or other "
			      "failure mode) ***", entity);
			msg_sent = 1;
		} else if ((sig == SIGTERM) || (sig == SIGKILL)) {
			error("*** %s CANCELLED AT %s ***", entity, time_str);
			msg_sent = 1;
		}
	}
	if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
	    (sig == SIG_PREEMPTED)  || (sig == SIG_FAILURE))
		goto done;

	if (sig == SIG_ABORT) {
		sig = SIGKILL;
		job->aborted = true;
	}

	pthread_mutex_lock(&suspend_mutex);
	if (suspended && (sig != SIGKILL)) {
		rc = -1;
		errnum = ESLURMD_STEP_SUSPENDED;
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	if (sig == SIG_DEBUG_WAKE) {
		int i;
		for (i = 0; i < job->node_tasks; i++)
			pdebug_wake_process(job, job->task[i]->pid);
		pthread_mutex_unlock(&suspend_mutex);
		goto done;
	}

	/*
	 * Signal the container
	 */
	if (proctrack_g_signal(job->cont_id, sig) < 0) {
		rc = -1;
		errnum = errno;
		verbose("Error sending signal %d to %u.%u: %m",
			sig, job->jobid, job->stepid);
	} else {
		verbose("Sent signal %d to %u.%u",
			sig, job->jobid, job->stepid);
	}
	pthread_mutex_unlock(&suspend_mutex);

done:
	/* Send the return code and errnum */
	safe_write(fd, &rc, sizeof(int));
	safe_write(fd, &errnum, sizeof(int));
	return SLURM_SUCCESS;
rwfail:
	return SLURM_FAILURE;
}