void Scheduler::finishFakeLeave(ThreadInfo* th) { assert(th->fakeLeave); DEBUG_FL("%s (%d) @ 0x%lx finishFakeLeave()", GetSyscallName(th->fakeLeave->syscallNumber), th->fakeLeave->syscallNumber, th->fakeLeave->pc); assert_msg(th->state == RUNNING, "gid 0x%x invalid state %d", th->gid, th->state); FakeLeaveInfo* si = th->fakeLeave; fakeLeaves.remove(si); delete si; assert(th->fakeLeave == nullptr); }
static bool PostPatchTimeoutSyscall(uint32_t tid, CONTEXT* ctxt, SYSCALL_STANDARD std, int syscall, ADDRINT prevIp, ADDRINT timeoutArgVal) { assert(inFakeTimeoutMode[tid]); int res = (int)PIN_GetSyscallNumber(ctxt, std); // Decide if it timed out bool timedOut; if (syscall == SYS_futex) { timedOut = (res == -ETIMEDOUT); } else { timedOut = (res == 0); } bool isSleeping = zinfo->sched->isSleeping(procIdx, tid); // Decide whether to retry bool retrySyscall; if (!timedOut) { if (isSleeping) zinfo->sched->notifySleepEnd(procIdx, tid); retrySyscall = false; } else { retrySyscall = isSleeping; } if (retrySyscall && zinfo->procArray[procIdx]->isInFastForward()) { warn("[%d] Fast-forwarding started, not retrying timeout syscall (%s)", tid, GetSyscallName(syscall)); retrySyscall = false; assert(isSleeping); zinfo->sched->notifySleepEnd(procIdx, tid); } if (retrySyscall) { // ADDRINT curIp = PIN_GetContextReg(ctxt, REG_INST_PTR); //info("[%d] post-patch, retrying, IP: 0x%lx -> 0x%lx", tid, curIp, prevIp); PIN_SetContextReg(ctxt, REG_INST_PTR, prevIp); PIN_SetSyscallNumber(ctxt, std, syscall); } else { // Restore timeout arg PIN_SetSyscallArgument(ctxt, std, getTimeoutArg(syscall), timeoutArgVal); inFakeTimeoutMode[tid] = false; // Restore arg? I don't think we need this! /*if (syscall == SYS_futex) { PIN_SetSyscallNumber(ctxt, std, -ETIMEDOUT); } else { assert(syscall == SYS_epoll_wait || syscall == SYS_epoll_pwait || syscall == SYS_poll); PIN_SetSyscallNumber(ctxt, std, 0); //no events returned }*/ } //info("[%d] post-patch %s (%d), timedOut %d, sleeping (orig) %d, retrying %d, orig res %d, patched res %d", tid, GetSyscallName(syscall), syscall, timedOut, isSleeping, retrySyscall, res, (int)PIN_GetSyscallNumber(ctxt, std)); return retrySyscall; }
// Accurate join-leave implementation void Scheduler::syscallLeave(uint32_t pid, uint32_t tid, uint32_t cid, uint64_t pc, int syscallNumber, uint64_t arg0, uint64_t arg1) { futex_lock(&schedLock); uint32_t gid = getGid(pid, tid); ThreadInfo* th = contexts[cid].curThread; assert(th->gid == gid); assert_msg(th->cid == cid, "%d != %d", th->cid, cid); assert(th->state == RUNNING); assert_msg(pid < blockingSyscalls.size(), "%d >= %ld?", pid, blockingSyscalls.size()); bool blacklisted = blockingSyscalls[pid].find(pc) != blockingSyscalls[pid].end(); if (blacklisted || th->markedForSleep) { DEBUG_FL("%s @ 0x%lx calling leave(), reason: %s", GetSyscallName(syscallNumber), pc, blacklisted? "blacklist" : "sleep"); futex_unlock(&schedLock); leave(pid, tid, cid); } else { DEBUG_FL("%s @ 0x%lx skipping leave()", GetSyscallName(syscallNumber), pc); FakeLeaveInfo* si = new FakeLeaveInfo(pc, th, syscallNumber, arg0, arg1); fakeLeaves.push_back(si); // FIXME(dsm): zsim.cpp's SyscallEnter may be checking whether we are in a syscall and not calling us. // If that's the case, this would be stale, which may lead to some false positives/negatives futex_unlock(&schedLock); } }
void Scheduler::watchdogThreadFunc() { info("Started scheduler watchdog thread"); uint64_t lastPhase = 0; int multiplier = 1; uint64_t lastMs = 0; uint64_t fakeLeaveStalls = 0; while (true) { TrueSleep(multiplier*WATCHDOG_INTERVAL_USEC); if (zinfo->terminationConditionMet) { // Synchronize to avoid racing with EndOfPhaseActions code // (zinfo->terminationConditionMet is set on EndOfPhaseActions, // which has schedLock held, we must let it finish) futex_lock(&schedLock); info("Terminating scheduler watchdog thread"); futex_unlock(&schedLock); SimEnd(); } //Fastpath (unlocked, benign read races, only modifies local state) if (lastPhase != curPhase && pendingPidCleanups.size() == 0) { lastPhase = curPhase; fakeLeaveStalls = 0; if (multiplier < WATCHDOG_MAX_MULTIPLER) multiplier++; continue; } //if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) info("Mult %d curPhase %ld", multiplier, curPhase); futex_lock(&schedLock); if (lastPhase == curPhase && !fakeLeaves.empty() && (fakeLeaves.front()->th->futexJoin.action != FJA_WAKE)) { if (++fakeLeaveStalls >= WATCHDOG_STALL_THRESHOLD) { info("Detected possible stall due to fake leaves (%ld current)", fakeLeaves.size()); // Uncomment to print all leaves FakeLeaveInfo* pfl = fakeLeaves.front(); while (pfl) { info(" [%d/%d] %s (%d) @ 0x%lx", getPid(pfl->th->gid), getTid(pfl->th->gid), GetSyscallName(pfl->syscallNumber), pfl->syscallNumber, pfl->pc); pfl = pfl->next; } // Trigger a leave() on the first process, if the process's blacklist regex allows it FakeLeaveInfo* fl = fakeLeaves.front(); ThreadInfo* th = fl->th; uint32_t pid = getPid(th->gid); uint32_t tid = getTid(th->gid); uint32_t cid = th->cid; const g_string& sbRegexStr = zinfo->procArray[pid]->getSyscallBlacklistRegex(); std::regex sbRegex(sbRegexStr.c_str()); if (std::regex_match(GetSyscallName(fl->syscallNumber), sbRegex)) { // If this is the last leave we catch, it is the culprit for sure -> blacklist it // Over time, this will blacklist every blocking syscall // The root reason for being conservative though is that we don't have a sure-fire // way to distinguish IO waits from truly blocking syscalls (TODO) if (fakeLeaves.size() == 1) { info("Blacklisting from future fake leaves: [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx", pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1); blockingSyscalls[pid].insert(fl->pc); } uint64_t pc = fl->pc; do { finishFakeLeave(th); futex_unlock(&schedLock); leave(pid, tid, cid); futex_lock(&schedLock); // also do real leave for other threads blocked at the same pc ... fl = fakeLeaves.front(); if (fl == nullptr || getPid(th->gid) != pid || fl->pc != pc) break; th = fl->th; tid = getTid(th->gid); cid = th->cid; // ... until a lower bound on queue size, in order to make blacklist work } while (fakeLeaves.size() > 8); } else { info("Skipping, [%d] %s @ 0x%lx | arg0 0x%lx arg1 0x%lx does not match blacklist regex (%s)", pid, GetSyscallName(fl->syscallNumber), fl->pc, fl->arg0, fl->arg1, sbRegexStr.c_str()); } fakeLeaveStalls = 0; } } else { fakeLeaveStalls = 0; } if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) { //info("Watchdog Thread: Sleep dep detected...") int64_t wakeupPhase = sleepQueue.front()->wakeupPhase; int64_t wakeupCycles = (wakeupPhase - curPhase)*zinfo->phaseLength; int64_t wakeupUsec = (wakeupCycles > 0)? wakeupCycles/zinfo->freqMHz : 0; //info("Additional usecs of sleep %ld", wakeupUsec); if (wakeupUsec > 10*1000*1000) warn("Watchdog sleeping for a long time due to long sleep, %ld secs", wakeupUsec/1000/1000); futex_unlock(&schedLock); TrueSleep(WATCHDOG_INTERVAL_USEC + wakeupUsec); futex_lock(&schedLock); if (lastPhase == curPhase && scheduledThreads == outQueue.size() && !sleepQueue.empty()) { ThreadInfo* sth = sleepQueue.front(); uint64_t curMs = curPhase*zinfo->phaseLength/zinfo->freqMHz/1000; uint64_t endMs = sth->wakeupPhase*zinfo->phaseLength/zinfo->freqMHz/1000; (void)curMs; (void)endMs; //make gcc happy if (curMs > lastMs + 1000) { info("Watchdog Thread: Driving time forward to avoid deadlock on sleep (%ld -> %ld ms)", curMs, endMs); lastMs += 1000; } while (sth->state == SLEEPING) { idlePhases.inc(); callback(); //sth will eventually get woken up if (futex_haswaiters(&schedLock)) { //happens commonly with multiple sleepers and very contended I/O... //info("Sched: Threads waiting on advance, startPhase %ld curPhase %ld", lastPhase, curPhase); break; } if (zinfo->terminationConditionMet) { info("Termination condition met inside watchdog thread loop, exiting"); break; } } idlePeriods.inc(); multiplier = 0; } } if (multiplier < WATCHDOG_MAX_MULTIPLER) { multiplier++; } lastPhase = curPhase; //Lazily clean state of processes that terminated abruptly //NOTE: For now, we rely on the process explicitly telling us that it's going to terminate. //We could make this self-checking by periodically checking for liveness of the processes we're supposedly running. //The bigger problem is that if we get SIGKILL'd, we may not even leave a consistent zsim state behind. while (pendingPidCleanups.size()) { std::pair<uint32_t, uint32_t> p = pendingPidCleanups.back(); uint32_t pid = p.first; //the procIdx pid uint32_t osPid = p.second; std::stringstream ss; ss << "/proc/" << osPid; struct stat dummy; if (stat(ss.str().c_str(), &dummy) == 0) { info("[watchdog] Deferring cleanup of pid %d (%d), not finished yet", pid, osPid); break; } pendingPidCleanups.pop_back(); //must happen while we have the lock futex_unlock(&schedLock); processCleanup(pid); futex_lock(&schedLock); } if (terminateWatchdogThread) { futex_unlock(&schedLock); break; } else { futex_unlock(&schedLock); } } info("Finished scheduler watchdog thread"); }