Esempio n. 1
0
/**
 * For each millisecond that has passed since this function was last called:
 *   Scan the job buffer and increment the internalTicksCounter 
 *      with 1ms to keep track of where we are
 */
inline
void 
ThreadConfig::scanTimeQueue()
{
  unsigned int maxCounter = 0;
  const NDB_TICKS currTicks = NdbTick_getCurrentTicks();

  if (NdbTick_Compare(currTicks, globalData.internalTicksCounter) < 0) {
//--------------------------------------------------------------------
// This could occur around 2036 or if the operator decides to change
// time backwards. We cannot know how long time has past since last
// time and we make a best try with 0 milliseconds.
//--------------------------------------------------------------------
    const Uint64 backward = 
      NdbTick_Elapsed(currTicks, globalData.internalTicksCounter).milliSec();

    // Silently ignore sub millisecond backticks.
    // Such 'noise' is unfortunately common even for monotonic timers.
    if (backward > 0)
    {
      g_eventLogger->warning("Time moved backwards with %llu ms", backward);
      globalData.internalTicksCounter = currTicks;
      assert(backward < 100 || !NdbTick_IsMonotonic()); 
    }
    return;
  }//if

  Uint64 elapsed = 
    NdbTick_Elapsed(globalData.internalTicksCounter,currTicks).milliSec();
  if (elapsed > 1500) {
//--------------------------------------------------------------------
// Time has moved forward more than a second. Either it could happen
// if operator changed the time or if the OS has misbehaved badly.
// We set the new time to one second from the past.
//--------------------------------------------------------------------
    g_eventLogger->warning("Time moved forward with %llu ms", elapsed);
    elapsed -= 1000;
    globalData.internalTicksCounter = 
      NdbTick_AddMilliseconds(globalData.internalTicksCounter,elapsed);
  }//if
  while ((elapsed > 0) &&
         (maxCounter < 20)){
    globalData.internalTicksCounter = 
      NdbTick_AddMilliseconds(globalData.internalTicksCounter,1);
    elapsed--;
    maxCounter++;
    globalTimeQueue.scanTable();
  }//while
}//ThreadConfig::scanTimeQueue()
Esempio n. 2
0
/**
 * Main loop of the watchdog thread.
 *
 * Until theStop is set, each iteration:
 *   1. Sleeps 100 ms and samples the tick counter, warning if time ticked
 *      backwards or if the sleep lasted more than twice as long as requested.
 *   2. Under m_mutex, snapshots every watched thread's counter. A non-zero
 *      counter means the thread reported progress; its bookkeeping
 *      (start ticks, slow-warn delay, last counter value) is reset.
 *      A zero counter means no progress since the previous check; the
 *      data needed for reporting is copied out into local arrays.
 *   3. Without holding the mutex, logs a warning for each thread that made
 *      no progress and calls shutdownSystem() once the elapsed time (ms)
 *      since its last progress exceeds 3 * theInterval.
 */
void 
WatchDog::run()
{
  unsigned int sleep_time;
  NDB_TICKS last_ticks, now;
  Uint32 numThreads;
  /*
    Per-thread snapshots taken under the mutex. Except for counterValue,
    entries are only filled in (and only read) for threads whose
    counterValue is zero.
  */
  Uint32 counterValue[MAX_WATCHED_THREADS];
  Uint32 oldCounterValue[MAX_WATCHED_THREADS];
  Uint32 threadId[MAX_WATCHED_THREADS];
  NDB_TICKS start_ticks[MAX_WATCHED_THREADS];
  Uint32 theIntervalCheck[MAX_WATCHED_THREADS];
  Uint32 elapsed[MAX_WATCHED_THREADS];

  if (!NdbTick_IsMonotonic())
  {
    g_eventLogger->warning("A monotonic timer was not available on this platform.");
    g_eventLogger->warning("Adjusting system time manually, or otherwise (e.g. NTP), "
              "may cause false watchdog alarms, temporary freeze, or node shutdown.");
  }

  last_ticks = NdbTick_getCurrentTicks();

  while (!theStop)
  {
    sleep_time= 100;

    NdbSleep_MilliSleep(sleep_time);
    if(theStop)
      break;

    now = NdbTick_getCurrentTicks();

    if (NdbTick_Compare(now, last_ticks) < 0)
    {
      g_eventLogger->warning("Watchdog: Time ticked backwards %llu ms.",
                             NdbTick_Elapsed(now, last_ticks).milliSec());
      /**
       * A backtick after sleeping 100ms, is considered a
       * fatal error if monotonic timers are used.
       */
      assert(!NdbTick_IsMonotonic());
    }
    // Print warnings if sleeping much longer than expected
    else if (NdbTick_Elapsed(last_ticks, now).milliSec() > sleep_time*2)
    {
      /*
        Value-initialised so that the failure branch below logs zeros
        rather than indeterminate values when times() did not fill it in.
      */
      struct tms my_tms = {};
      if (times(&my_tms) != (clock_t)-1)
      {
        g_eventLogger->info("Watchdog: User time: %llu  System time: %llu",
                          (Uint64)my_tms.tms_utime,
                          (Uint64)my_tms.tms_stime);
      }
      else
      {
        g_eventLogger->info("Watchdog: User time: %llu System time: %llu (errno=%d)",
                          (Uint64)my_tms.tms_utime,
                          (Uint64)my_tms.tms_stime,
                          errno);
      }
      g_eventLogger->warning("Watchdog: Warning overslept %llu ms, expected %u ms.",
                             NdbTick_Elapsed(last_ticks, now).milliSec(),
                             sleep_time);
    }
    last_ticks = now;

    /*
      Copy out all active counters under locked mutex, then check them
      afterwards without holding the mutex.
    */
    NdbMutex_Lock(m_mutex);
    numThreads = m_watchedCount;
    for (Uint32 i = 0; i < numThreads; i++)
    {
#ifdef NDB_HAVE_XCNG
      /* atomically read and clear watchdog counter */
      counterValue[i] = xcng(m_watchedList[i].m_watchCounter, 0);
#else
      counterValue[i] = *(m_watchedList[i].m_watchCounter);
#endif
      if (likely(counterValue[i] != 0))
      {
        /*
          The thread responded since last check, so just update state until
          next check.
         */
#ifndef NDB_HAVE_XCNG
        /*
          There is a small race here. If the thread changes the counter
          in-between the read and setting to zero here in the watchdog
          thread, then gets stuck immediately after, we may report the
          wrong action that it got stuck on.
          But there will be no reporting of non-stuck thread because of
          this race, nor will there be missed reporting.
        */
        *(m_watchedList[i].m_watchCounter) = 0;
#endif
        m_watchedList[i].m_startTicks = now;
        m_watchedList[i].m_slowWarnDelay = theInterval;
        m_watchedList[i].m_lastCounterValue = counterValue[i];
      }
      else
      {
        /*
          No progress since the previous check: snapshot what the reporting
          pass below needs, so it can run without the mutex.
        */
        start_ticks[i] = m_watchedList[i].m_startTicks;
        threadId[i] = m_watchedList[i].m_threadId;
        oldCounterValue[i] = m_watchedList[i].m_lastCounterValue;
        theIntervalCheck[i] = m_watchedList[i].m_slowWarnDelay;
        elapsed[i] = (Uint32)NdbTick_Elapsed(start_ticks[i], now).milliSec();
        /*
          For counter value 9 (see below) a warning is issued this round
          once the delay is passed; push the next warning one further
          interval into the future.
        */
        if (oldCounterValue[i] == 9 && elapsed[i] >= theIntervalCheck[i])
          m_watchedList[i].m_slowWarnDelay += theInterval;
      }
    }
    NdbMutex_Unlock(m_mutex);

    /*
      Now check each watched thread if it has reported progress since previous
      check. Warn about any stuck threads, and eventually force shutdown the
      server.
    */
    for (Uint32 i = 0; i < numThreads; i++)
    {
      if (counterValue[i] != 0)
        continue;

      /*
        Counter value == 9 indicates malloc going on, this can take some time
        so only warn if we pass the watchdog interval
      */
      if (oldCounterValue[i] != 9 || elapsed[i] >= theIntervalCheck[i])
      {
        const char *last_stuck_action = get_action(oldCounterValue[i]);
        if (last_stuck_action != NULL)
        {
          g_eventLogger->warning("Ndb kernel thread %u is stuck in: %s "
                                 "elapsed=%u",
                                 threadId[i], last_stuck_action, elapsed[i]);
        }
        else
        {
          g_eventLogger->warning("Ndb kernel thread %u is stuck in: Unknown place %u "
                                 "elapsed=%u",
                                 threadId[i],  oldCounterValue[i], elapsed[i]);
        }
        {
          /*
            Value-initialised so that the failure branch below logs zeros
            rather than indeterminate values when times() did not fill it in.
          */
          struct tms my_tms = {};
          if (times(&my_tms) != (clock_t)-1)
          {
            g_eventLogger->info("Watchdog: User time: %llu  System time: %llu",
                              (Uint64)my_tms.tms_utime,
                              (Uint64)my_tms.tms_stime);
          }
          else
          {
            g_eventLogger->info("Watchdog: User time: %llu System time: %llu (errno=%d)",
                              (Uint64)my_tms.tms_utime,
                              (Uint64)my_tms.tms_stime,
                              errno);
          }
        }
        /* Stuck for more than three watchdog intervals: give up. */
        if (elapsed[i] > 3 * theInterval)
        {
          shutdownSystem(last_stuck_action);
        }
      }
    }
  }
  return;
}