/*
 * Check that a child is making forward progress by comparing the current
 * wall-clock time against the timestamp it recorded before making its
 * last syscall.
 *
 * Nothing is done for an empty pid slot, for a zero timestamp, or while
 * less than 30 seconds have elapsed.
 * At exactly 30 seconds the stuck syscall is reported and a SIGKILL is sent.
 * From 40 seconds onwards another SIGKILL is sent on every call.
 *
 * An implausibly large delta is treated as a clock wrap/jump: the recorded
 * timestamp is reset to 'now' and no signal is sent on this pass.
 */
static void check_child_progress(struct childdata *child)
{
	struct syscallrecord *rec;
	struct timeval tv;
	time_t diff, old, now;
	pid_t pid;

	pid = child->pid;
	if (pid == EMPTY_PIDSLOT)
		return;

	rec = &child->syscall;
	old = rec->tv.tv_sec;

	/* presumably the child hasn't recorded a syscall yet. */
	if (old == 0)
		return;

	gettimeofday(&tv, NULL);
	now = tv.tv_sec;

	/* The wall clock can step in either direction, so take the magnitude. */
	if (old > now)
		diff = old - now;
	else
		diff = now - old;

	/* hopefully the common case. */
	if (diff < 30)
		return;

	/*
	 * if we wrapped, just reset it, we'll pick it up next time around.
	 * This has to be decided before any signal is sent, otherwise a clock
	 * jump would get a healthy child killed.
	 */
	if (diff > 2146) {	/* max adjtime offset, + the 1 second since last time. */
		output(1, "child %u wrapped! old=%lu now=%lu\n",
			child->num, (unsigned long) old, (unsigned long) now);
		rec->tv.tv_sec = now;
		return;
	}

	/* After 30 seconds of no progress, send a kill signal. */
	if (diff == 30) {
		stuck_syscall_info(child);
		debugf("child %d (pid %d) hasn't made progress in 30 seconds! Sending SIGKILL\n",
			child->num, (int) pid);
		child->kill_count++;
		kill_pid(pid);
	}

	/* if we're still around after 40s, repeatedly send SIGKILLs every second. */
	if (diff < 40)
		return;

	/* time_t has no printf conversion of its own; print it as a long. */
	debugf("sending another SIGKILL to child %d (pid %d). [kill count:%d] [diff:%ld]\n",
		child->num, (int) pid, child->kill_count, (long) diff);
	child->kill_count++;
	kill_pid(pid);
}
/* * Check that a child is making forward progress by comparing the timestamps it * recorded before making its last syscall. * If no progress is being made, send SIGKILLs to it. */ static bool is_child_making_progress(struct childdata *child) { struct syscallrecord *rec; struct timespec tp; time_t diff, old, now; pid_t pid; pid = child->pid; if (pid == EMPTY_PIDSLOT) return TRUE; rec = &child->syscall; old = rec->tp.tv_sec; /* haven't done anything yet. */ if (old == 0) return TRUE; clock_gettime(CLOCK_MONOTONIC, &tp); now = tp.tv_sec; if (old > now) diff = old - now; else diff = now - old; /* hopefully the common case. */ if (diff < 30) return TRUE; /* After 30 seconds of no progress, send a kill signal. */ if (diff == 30) { stuck_syscall_info(child); debugf("child %d (pid %u) hasn't made progress in 30 seconds! Sending SIGKILL\n", child->num, pid); child->kill_count++; kill_pid(pid); } /* if we're still around after 40s, repeatedly send SIGKILLs every second. */ if (diff < 40) return FALSE; debugf("sending another SIGKILL to child %d (pid %u). [kill count:%d] [diff:%d]\n", child->num, pid, child->kill_count, diff); child->kill_count++; kill_pid(pid); return FALSE; }
/*
 * Walk every pid slot and check that the child in it is making forward
 * progress, by comparing the current wall-clock time against the timestamp
 * stored for that slot in shm->tv[].
 *
 * Empty slots and slots with a zero timestamp are skipped.
 * A timestamp more than 3 seconds in the future, or more than 1000 seconds
 * in the past, is treated as bogus and reset to 'now'.
 * Once a child has gone 30 seconds or more without progress it is sent
 * SIGKILL, followed by a one second sleep.
 */
static void check_children(void)
{
	struct timeval tv;
	time_t diff;
	time_t old, now;
	unsigned int i;

	for_each_pidslot(i) {
		pid_t pid;

		pid = shm->pids[i];
		if (pid == EMPTY_PIDSLOT)
			continue;

		old = shm->tv[i].tv_sec;
		if (old == 0)
			continue;

		/* Re-read the clock for every slot: the sleep below moves time on. */
		gettimeofday(&tv, NULL);
		now = tv.tv_sec;

		/* if we wrapped, just reset it, we'll pick it up next time around. */
		if (old > (now + 3)) {
			output(1, "child %u wrapped! old=%lu now=%lu\n",
				i, (unsigned long) old, (unsigned long) now);
			shm->tv[i].tv_sec = now;
			continue;
		}

		diff = now - old;

		/* if we're way off, we're comparing garbage. Reset it. */
		if (diff > 1000) {
			output(0, "huge delta! pid slot %u [%d]: old:%ld now:%ld diff:%ld. Setting to now.\n",
				i, (int) pid, (long) old, (long) now, (long) diff);
			shm->tv[i].tv_sec = now;
			continue;
		}

		/* After 30 seconds of no progress, send a kill signal. */
		if (diff == 30) {
			stuck_syscall_info(i);
			output(0, "pid %d hasn't made progress in 30 seconds! (last:%ld now:%ld diff:%ld)\n",
				(int) pid, (long) old, (long) now, (long) diff);
		}

		if (diff >= 30) {
			int ret;

			/* Any earlier kill makes this one "another", so test against zero. */
			if (shm->kill_count[i] > 0) {
				output(0, "sending another SIGKILL to pid %d. [kill count:%d] [diff:%ld]\n",
					(int) pid, shm->kill_count[i], (long) diff);
			} else {
				output(0, "sending SIGKILL to pid %d. [diff:%ld]\n",
					(int) pid, (long) diff);
			}
			shm->kill_count[i]++;

			ret = kill(pid, SIGKILL);
			if (ret != 0) {
				output(0, "couldn't kill pid %d [%s]\n",
					(int) pid, strerror(errno));
			}
			sleep(1);	// give child time to exit.
		}
	}
}