Esempio n. 1
0
/**
 * Acquire the spin lock: fast path.
 *
 * Makes one attempt to swap LOCK_VAL into sl->m_lock using xcng().
 * If the value that was swapped out equals UNLOCK_VAL the lock was free
 * and the caller now owns it. Otherwise lock_slow() is called to finish
 * the acquisition.
 *
 * @param sl  spin lock to acquire
 */
static
inline
void
lock(struct thr_spin_lock* sl)
{
  volatile unsigned* lock_word = &sl->m_lock;
  unsigned int locked = LOCK_VAL;

  if (!likely(xcng(lock_word, locked) == UNLOCK_VAL))
  {
    /* Lock word did not hold UNLOCK_VAL: take the slow path. */
    lock_slow(sl);
  }
}
Esempio n. 2
0
/**
 * Acquire the spin lock: slow path.
 *
 * Repeats two steps until the lock has been obtained:
 *  1. call cpu_pause() at least once, and keep calling it for as long as
 *     a plain read of sl->m_lock still yields LOCK_VAL;
 *  2. try to swap LOCK_VAL into the lock word with xcng(); if the value
 *     swapped out is UNLOCK_VAL the lock is now held and the function
 *     returns, otherwise go back to step 1.
 *
 * @param sl  spin lock to acquire
 */
static
void
lock_slow(struct thr_spin_lock* sl)
{
  volatile unsigned* lock_word = &sl->m_lock;
  unsigned int locked = LOCK_VAL;

  for (;;)
  {
    /* Wait using plain reads only; xcng() is not issued while spinning. */
    do
    {
      cpu_pause();
    } while (*lock_word == locked);

    if (likely(xcng(lock_word, locked) == UNLOCK_VAL))
    {
      return;
    }
  }
}
Esempio n. 3
0
void 
WatchDog::run()
{
  unsigned int sleep_time;
  NDB_TICKS last_ticks, now;
  Uint32 numThreads;
  Uint32 counterValue[MAX_WATCHED_THREADS];
  Uint32 oldCounterValue[MAX_WATCHED_THREADS];
  Uint32 threadId[MAX_WATCHED_THREADS];
  NDB_TICKS start_ticks[MAX_WATCHED_THREADS];
  Uint32 theIntervalCheck[MAX_WATCHED_THREADS];
  Uint32 elapsed[MAX_WATCHED_THREADS];

  if (!NdbTick_IsMonotonic())
  {
    g_eventLogger->warning("A monotonic timer was not available on this platform.");
    g_eventLogger->warning("Adjusting system time manually, or otherwise (e.g. NTP), "
              "may cause false watchdog alarms, temporary freeze, or node shutdown.");
  }

  last_ticks = NdbTick_getCurrentTicks();

  while (!theStop)
  {
    sleep_time= 100;

    NdbSleep_MilliSleep(sleep_time);
    if(theStop)
      break;

    now = NdbTick_getCurrentTicks();

    if (NdbTick_Compare(now, last_ticks) < 0)
    {
      g_eventLogger->warning("Watchdog: Time ticked backwards %llu ms.",
                             NdbTick_Elapsed(now, last_ticks).milliSec());
      /**
       * A backtick after sleeping 100ms, is considdered a
       * fatal error if monotonic timers are used.
       */
      assert(!NdbTick_IsMonotonic());
    }
    // Print warnings if sleeping much longer than expected
    else if (NdbTick_Elapsed(last_ticks, now).milliSec() > sleep_time*2)
    {
      struct tms my_tms;
      if (times(&my_tms) != (clock_t)-1)
      {
        g_eventLogger->info("Watchdog: User time: %llu  System time: %llu",
                          (Uint64)my_tms.tms_utime,
                          (Uint64)my_tms.tms_stime);
      }
      else
      {
        g_eventLogger->info("Watchdog: User time: %llu System time: %llu (errno=%d)",
                          (Uint64)my_tms.tms_utime,
                          (Uint64)my_tms.tms_stime,
                          errno);
      }
      g_eventLogger->warning("Watchdog: Warning overslept %llu ms, expected %u ms.",
                             NdbTick_Elapsed(last_ticks, now).milliSec(),
                             sleep_time);
    }
    last_ticks = now;

    /*
      Copy out all active counters under locked mutex, then check them
      afterwards without holding the mutex.
    */
    NdbMutex_Lock(m_mutex);
    numThreads = m_watchedCount;
    for (Uint32 i = 0; i < numThreads; i++)
    {
#ifdef NDB_HAVE_XCNG
      /* atomically read and clear watchdog counter */
      counterValue[i] = xcng(m_watchedList[i].m_watchCounter, 0);
#else
      counterValue[i] = *(m_watchedList[i].m_watchCounter);
#endif
      if (likely(counterValue[i] != 0))
      {
        /*
          The thread responded since last check, so just update state until
          next check.
         */
#ifndef NDB_HAVE_XCNG
        /*
          There is a small race here. If the thread changes the counter
          in-between the read and setting to zero here in the watchdog
          thread, then gets stuck immediately after, we may report the
          wrong action that it got stuck on.
          But there will be no reporting of non-stuck thread because of
          this race, nor will there be missed reporting.
        */
        *(m_watchedList[i].m_watchCounter) = 0;
#endif
        m_watchedList[i].m_startTicks = now;
        m_watchedList[i].m_slowWarnDelay = theInterval;
        m_watchedList[i].m_lastCounterValue = counterValue[i];
      }
      else
      {
        start_ticks[i] = m_watchedList[i].m_startTicks;
        threadId[i] = m_watchedList[i].m_threadId;
        oldCounterValue[i] = m_watchedList[i].m_lastCounterValue;
        theIntervalCheck[i] = m_watchedList[i].m_slowWarnDelay;
        elapsed[i] = (Uint32)NdbTick_Elapsed(start_ticks[i], now).milliSec();
        if (oldCounterValue[i] == 9 && elapsed[i] >= theIntervalCheck[i])
          m_watchedList[i].m_slowWarnDelay += theInterval;
      }
    }
    NdbMutex_Unlock(m_mutex);

    /*
      Now check each watched thread if it has reported progress since previous
      check. Warn about any stuck threads, and eventually force shutdown the
      server.
    */
    for (Uint32 i = 0; i < numThreads; i++)
    {
      if (counterValue[i] != 0)
        continue;

      /*
        Counter value == 9 indicates malloc going on, this can take some time
        so only warn if we pass the watchdog interval
      */
      if (oldCounterValue[i] != 9 || elapsed[i] >= theIntervalCheck[i])
      {
        const char *last_stuck_action = get_action(oldCounterValue[i]);
        if (last_stuck_action != NULL)
        {
          g_eventLogger->warning("Ndb kernel thread %u is stuck in: %s "
                                 "elapsed=%u",
                                 threadId[i], last_stuck_action, elapsed[i]);
        }
        else
        {
          g_eventLogger->warning("Ndb kernel thread %u is stuck in: Unknown place %u "
                                 "elapsed=%u",
                                 threadId[i],  oldCounterValue[i], elapsed[i]);
        }
        {
          struct tms my_tms;
          if (times(&my_tms) != (clock_t)-1)
          {
            g_eventLogger->info("Watchdog: User time: %llu  System time: %llu",
                              (Uint64)my_tms.tms_utime,
                              (Uint64)my_tms.tms_stime);
          }
          else
          {
            g_eventLogger->info("Watchdog: User time: %llu System time: %llu (errno=%d)",
                              (Uint64)my_tms.tms_utime,
                              (Uint64)my_tms.tms_stime,
                              errno);
          }
        }
        if (elapsed[i] > 3 * theInterval)
        {
          shutdownSystem(last_stuck_action);
        }
      }
    }
  }
  return;
}