示例#1
0
/*
 * Check that a child is making forward progress by comparing the timestamps it
 * recorded before making its last syscall.
 * If no progress is being made, send SIGKILLs to it.
 */
static void check_child_progress(struct childdata *child)
{
	struct syscallrecord *rec;
	struct timeval tv;
	time_t diff, old, now;
	pid_t pid;

	pid = child->pid;

	if (pid == EMPTY_PIDSLOT)
		return;

	rec = &child->syscall;

	old = rec->tv.tv_sec;

	if (old == 0)
		return;

	gettimeofday(&tv, NULL);
	now = tv.tv_sec;

	if (old > now)
		diff = old - now;
	else
		diff = now - old;

	/* hopefully the common case. */
	if (diff < 30)
		return;

	/* After 30 seconds of no progress, send a kill signal. */
	if (diff == 30) {
		stuck_syscall_info(child);
		debugf("child %d (pid %u) hasn't made progress in 30 seconds! Sending SIGKILL\n",
				child->num, pid);
		child->kill_count++;
		kill_pid(pid);
	}

	/* if we're still around after 40s, repeatedly send SIGKILLs every second. */
	if (diff < 40)
		return;

	debugf("sending another SIGKILL to child %d (pid %u). [kill count:%d] [diff:%d]\n",
		child->num, pid, child->kill_count, diff);
	child->kill_count++;
	kill_pid(pid);

	/* if we wrapped, just reset it, we'll pick it up next time around. */
	if (diff > 2146) {	/* max adjtime offset, + the 1 second since last time. */
		output(1, "child %u wrapped! old=%lu now=%lu\n", child->num, old, now);
		rec->tv.tv_sec = now;
		return;
	}
}
示例#2
0
/*
 * Check that a child is making forward progress by comparing the timestamps it
 * recorded before making its last syscall.
 * If no progress is being made, send SIGKILLs to it.
 */
static bool is_child_making_progress(struct childdata *child)
{
	struct syscallrecord *rec;
	struct timespec tp;
	time_t diff, old, now;
	pid_t pid;

	pid = child->pid;

	if (pid == EMPTY_PIDSLOT)
		return TRUE;

	rec = &child->syscall;

	old = rec->tp.tv_sec;

	/* haven't done anything yet. */
	if (old == 0)
		return TRUE;

	clock_gettime(CLOCK_MONOTONIC, &tp);
	now = tp.tv_sec;

	if (old > now)
		diff = old - now;
	else
		diff = now - old;

	/* hopefully the common case. */
	if (diff < 30)
		return TRUE;

	/* After 30 seconds of no progress, send a kill signal. */
	if (diff == 30) {
		stuck_syscall_info(child);
		debugf("child %d (pid %u) hasn't made progress in 30 seconds! Sending SIGKILL\n",
				child->num, pid);
		child->kill_count++;
		kill_pid(pid);
	}

	/* if we're still around after 40s, repeatedly send SIGKILLs every second. */
	if (diff < 40)
		return FALSE;

	debugf("sending another SIGKILL to child %d (pid %u). [kill count:%d] [diff:%d]\n",
		child->num, pid, child->kill_count, diff);
	child->kill_count++;
	kill_pid(pid);

	return FALSE;
}
示例#3
0
static void check_children(void)
{
	struct timeval tv;
	time_t diff;
	time_t old, now;
	unsigned int i;

	for_each_pidslot(i) {
		pid_t pid;

		pid = shm->pids[i];

		if (pid == EMPTY_PIDSLOT)
			continue;

		old = shm->tv[i].tv_sec;

		if (old == 0)
			continue;

		gettimeofday(&tv, NULL);
		now = tv.tv_sec;

		/* if we wrapped, just reset it, we'll pick it up next time around. */
		if (old > (now + 3)) {
			output(1, "child %u wrapped! old=%lu now=%lu\n", i, old, now);
			shm->tv[i].tv_sec = now;
			continue;
		}

		diff = now - old;

		/* if we're way off, we're comparing garbage. Reset it. */
		if (diff > 1000) {
			output(0, "huge delta! pid slot %d [%d]: old:%ld now:%ld diff:%d.  Setting to now.\n", i, pid, old, now, diff);
			shm->tv[i].tv_sec = now;
			continue;
		}

		/* After 30 seconds of no progress, send a kill signal. */
		if (diff == 30) {
			stuck_syscall_info(i);
			output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%d)\n",
				pid, old, now, diff);
		}

		if (diff >= 30) {
			int ret;

			if (shm->kill_count[i] > 1) {
				output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%d]\n",
					pid, shm->kill_count[i], diff);
			} else {
				output(0, "sending SIGKILL to pid %d. [diff:%d]\n",
					pid, diff);
			}
			shm->kill_count[i]++;
			ret = kill(pid, SIGKILL);
			if (ret != 0) {
				output(0, "couldn't kill pid %d [%s]\n", pid, strerror(errno));
			}
			sleep(1);	// give child time to exit.
		}
	}
}