Esempio n. 1
0
/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file 
*
*  SYNOPSIS
*     static void sge_do_log(int aLevel, const char *aMessage, const char 
*
*  FUNCTION
*     ??? 
*
*  INPUTS
*     int aLevel           - log level
*     const char *aMessage - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage)
{
   int fd;

   if (me == QMASTER || me == EXECD || me == SCHEDD || me == SHADOWD) {
      if ((fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666)) >= 0) {
         char msg2log[4*MAX_STRING_SIZE];
         dstring msg;
         int len;

         sge_dstring_init(&msg, msg2log, sizeof(msg2log));

         append_time((time_t)sge_get_gmt(), &msg, false);

         sge_dstring_sprintf_append(&msg, "|%6.6s|%s|%c|%s\n",
                 progname,
                 unqualified_hostname,
                 aLevel,
                 aMessage);

         len = strlen(msg2log);
         if (write(fd, msg2log, len) != len) {
            /* we are in error logging here - the only chance to log this problem
             * might be to write it to stderr
             */
            fprintf(stderr, "can't log to file %s: %s\n", log_state_get_log_file(), sge_strerror(errno, &msg));
         }
         close(fd);
      }
   }

   return;
} /* sge_do_log() */
Esempio n. 2
0
/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file 
*
*  SYNOPSIS
*     static void sge_do_log(int aLevel, const char *aMessage, const char 
*
*  FUNCTION
*     ??? 
*
*  INPUTS
*     int aLevel           - log level
*     const char *aMessage - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage) 
{
   int fd;

   if (me == QMASTER || me == EXECD || me == SCHEDD || me == SHADOWD) {
      if ((fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666)) >= 0) {
         char msg2log[4*MAX_STRING_SIZE];
         dstring msg;
         
         sge_dstring_init(&msg, msg2log, sizeof(msg2log));

         append_time((time_t)sge_get_gmt(), &msg, false); 

         sge_dstring_sprintf_append(&msg, "|%6.6s|%s|%c|%s\n",
                 progname,
                 unqualified_hostname,
                 aLevel,
                 aMessage);

         write(fd, msg2log, strlen(msg2log));
         close(fd);
      }
   }   

   return;
} /* sge_do_log() */
Esempio n. 3
0
/****** Interactive/qrsh/write_pid_file() ***************************************
*
*  NAME
*     write_pid_file()  -- write a pid to file pid in $TMPDIR
*
*  SYNOPSIS
*     static int write_pid_file(pid_t pid);
*
*  FUNCTION
*     Writes the given pid to a file named pid in the directory
*     contained in environment variable TMPDIR
*
*  INPUTS
*     pid - the pid to write
*
*  RESULT
*     1, if all actions could be performed
*     0, if an error occured. Possible error situations are:
*        - the environement variable TMPDIR cannot be read
*        - the file cannot be opened
*
****************************************************************************
*/
static int write_pid_file(pid_t pid) 
{
   char *pid_file_name = NULL;
   int pid_file;
   char pid_str[20];

   if((pid_file_name = search_conf_val("qrsh_pid_file")) == NULL) {
      qrsh_error(MSG_CONF_NOCONFVALUE_S, "qrsh_pid_file");
      return 0;
   }

   if((pid_file = SGE_OPEN3(pid_file_name, O_WRONLY | O_APPEND | O_CREAT, 00744)) == -1) {
      dstring ds = DSTRING_INIT;
      qrsh_error(MSG_QRSH_STARTER_CANNOTWRITEPID_SS, pid_file_name, sge_strerror(errno, &ds));
      sge_dstring_free(&ds);
      return 0;
   }

   snprintf(pid_str, 20, pid_t_fmt, pid);

   if (write(pid_file, pid_str, strlen(pid_str)) != strlen(pid_str)) {
      dstring ds = DSTRING_INIT;
      qrsh_error(MSG_FILE_CANNOT_WRITE_SS, pid_file_name, sge_strerror(errno, &ds));
      sge_dstring_free(&ds);
   }

   SGE_CLOSE(pid_file);
   return 1;
}
Esempio n. 4
0
/****** Interactive/qrsh/qrsh_error() ******************************************
*  NAME
*     qrsh_error() -- propagate qrsh startup error to shepherd and qrsh client
*
*  SYNOPSIS
*     static 
*     void qrsh_error(const char *fmt, ...) 
*
*  FUNCTION
*     Writes the passed error message to a special error file in the jobs
*     temporary directory.
*     Separate error files are written for jobs and tasks (started by 
*     qrsh -inherit).
*
*  INPUTS
*     const char *fmt - format string
*     ...             - arguments to be formatted using the format string
*
*******************************************************************************/
static void qrsh_error(const char *fmt, ...)
{
   char *tmpdir = NULL;
   char *taskid = NULL;
   int file;
   char fileName[SGE_PATH_MAX];

   va_list ap;
   char message[MAX_STRING_SIZE];

   va_start(ap, fmt);

   if (fmt == NULL || *fmt == '\0') {
      return;
   }

   vsnprintf(message, MAX_STRING_SIZE, fmt, ap);
   va_end(ap);

   if ((tmpdir = search_conf_val("qrsh_tmpdir")) == NULL) {
      fprintf(stderr, "%s\n", message);
      fprintf(stderr, MSG_CONF_NOCONFVALUE_S, "qrsh_tmpdir");
      fprintf(stderr, "\n");
      return;
   }

   taskid = search_conf_val("qrsh_task_id");

   if (taskid != NULL) {
      snprintf(fileName, SGE_PATH_MAX, "%s/qrsh_error.%s", tmpdir, taskid);
   } else {
      snprintf(fileName, SGE_PATH_MAX, "%s/qrsh_error", tmpdir);
   }

   if ((file = SGE_OPEN3(fileName, O_WRONLY | O_APPEND | O_CREAT, 00744)) == -1) {
      fprintf(stderr, "%s\n", message);
      fprintf(stderr, MSG_QRSH_STARTER_CANNOTOPENFILE_SS, fileName, strerror(errno));
      fprintf(stderr, "\n");
      return;
   }

   if (write(file, message, strlen(message)) != strlen(message)) {
      dstring ds = DSTRING_INIT;
      fprintf(stderr, MSG_FILE_CANNOT_WRITE_SS, fileName, sge_strerror(errno, &ds));
      sge_dstring_free(&ds);
   }

   close(file);
}
Esempio n. 5
0
/****** Interactive/qrsh/writeExitCode() ***************************************
*
*  NAME
*    writeExitCode() -- write exit code of child process to file
*
*  SYNOPSIS
*     static int writeExitCode(int myExitCode, int programExitCode)
*
*  FUNCTION
*     If myExitCode != EXIT_SUCCESS, that means, if an error occured in
*     qrsh_starter, write this exit code to file,
*     else write the exit code of the child process (programExitCode).
*     The exit code is written to a file "qrsh_exit_code" in the
*     directory $TMPDIR.
*
*  INPUTS
*     myExitCode      - status of qrsh_starter
*     programExitCode - status of the child process
*
*  RESULT
*     EXIT_SUCCESS, if all actions could be performed,
*     EXIT_FAILURE, if one of the following errors occured:
*        - the environment variable TMPDIR cannot be read
*        - the file $TMPDIR/qrsh_exit_code cannot be written
*
****************************************************************************
*/
static int writeExitCode(int myExitCode, int programExitCode) 
{   
   int exitCode;
   char exitCode_str[20];
   char *tmpdir = NULL;
   char *taskid = NULL;
   int  file;
   char fileName[SGE_PATH_MAX];

   if(myExitCode != EXIT_SUCCESS) {
      exitCode = MAKEEXITSTATUS(myExitCode);
   } else {
      exitCode = programExitCode;
   }

   if((tmpdir = search_conf_val("qrsh_tmpdir")) == NULL) {
      qrsh_error(MSG_CONF_NOCONFVALUE_S, "qrsh_tmpdir");
      return EXIT_FAILURE;
   }
  
   taskid = get_conf_val("pe_task_id");
   
   if(taskid != NULL) {
      snprintf(fileName, SGE_PATH_MAX, "%s/qrsh_exit_code.%s", tmpdir, taskid);
   } else {
      snprintf(fileName, SGE_PATH_MAX, "%s/qrsh_exit_code", tmpdir);
   }

   if((file = SGE_OPEN3(fileName, O_WRONLY | O_APPEND | O_CREAT, 00744)) == -1) {
      dstring ds = DSTRING_INIT;
      qrsh_error(MSG_QRSH_STARTER_CANNOTOPENFILE_SS, fileName, sge_strerror(errno, &ds));
      sge_dstring_free(&ds);
      return EXIT_FAILURE;
   }
 
   snprintf(exitCode_str, 20, "%d", exitCode);
   if (write(file, exitCode_str, strlen(exitCode_str)) != strlen(exitCode_str)) {
      dstring ds = DSTRING_INIT;
      qrsh_error(MSG_FILE_CANNOT_WRITE_SS, fileName, sge_strerror(errno, &ds));
      sge_dstring_free(&ds);
   }
   SGE_CLOSE(file);
   
   return EXIT_SUCCESS;
}
Esempio n. 6
0
/****** uti/io/sge_copy_append() **********************************************
*  NAME
*     sge_copy_append() -- Copy/append one file to another 
*
*  SYNOPSIS
*     int sge_copy_append(char *src, const char *dst, 
*                         sge_mode_t mode) 
*
*  FUNCTION
*     Copy/append content from 'src' to 'dst' 
*
*  INPUTS
*     char *src       - source filename 
*     const char *dst - destination filename 
*     sge_mode_t mode - mode 
*
*  RESULT
*     int - error state
*         0 - OK
*        -1 - Error
*
*  SEE ALSO
*     uti/io/sge_mode_t
*
*  NOTES
*     MT-NOTE: sge_copy_append() is MT safe
******************************************************************************/
int sge_copy_append(char *src, const char *dst, sge_mode_t mode)
{
#define CPBUF 1024

   char buf[CPBUF];
   int fdsrc, fddst, modus, rs, ws;
   bool error;

   DENTER(TOP_LAYER, "sge_copy_append");
  
   if (src == NULL || dst == NULL || strlen(src) == 0 || strlen(dst) == 0 ||
      !(mode == SGE_MODE_APPEND || mode == SGE_MODE_COPY)) {
      DEXIT;
      return -1;   
   }
   if (!strcmp(src, dst)) {
      DEXIT;
      return -1;
   }   
 
   /* Return if source file doesn't exist */
   if ((fdsrc = SGE_OPEN2(src, O_RDONLY)) == -1) {
      DEXIT;
      return -1;
   }   
     
   if (mode == SGE_MODE_APPEND)
      modus = O_WRONLY | O_APPEND | O_CREAT;
   else
      modus = O_WRONLY | O_CREAT;      
    
   if ((fddst = SGE_OPEN3(dst, modus, 0666)) == -1) {
      DEXIT;
      return -1;
   }    
    
   error = false;
   while (!error) {
      rs = read(fdsrc, buf, 512);
      if (rs == -1 && errno == EINTR)
         continue;
      else if (rs == -1)
         error = true;
    
      if (!error && rs > 0) {      
         while (!error) {   
            ws = write(fddst, buf, rs);
            if (ws == -1 && errno == EINTR)   
               continue;
            else if (ws == -1) {
               error = true;
               break;
            } 
            else
               break;
         }
      }
      if (error)
         break;
      if (rs == 0)
         break;   
   }           
    
   close(fdsrc);
   close(fddst);
 
   DEXIT;
   return (error ? -1: 0);
}
Esempio n. 7
0
/****** err_trace/shepherd_trace_init_intern() *******************************
*  NAME
*     shepherd_trace_init_intern() -- Initialize shepherd's tracing. 
*
*  SYNOPSIS
*     static FILE* shepherd_trace_init(char *trace_file_path,
*													char *trace_file_name)
*
*  FUNCTION
*     Opens the shepherd's trace file and sets the FD_CLOEXEC-flag so it will
*     be closed automatically in an exec()-call.
*     Must be called with euid=admin user to work properly!
*
*  INPUTS
*     char *trace_file_path - either the whole path of the trace file (including
*                             the file itself)
*                             or NULL to retrieve the file pointer of an already
*                             opened trace file.
*     char *trace_file_name - the name of the trace file itself. Ignored when
*                             *trace_file_path is NULL.
* 
*  RESULT
*     FILE* - If successfully opened, the file pointer of shepherd's trace file.
*           - Otherwise NULL.
*******************************************************************************/
static FILE* shepherd_trace_init_intern(st_shepherd_file_t shepherd_file)
{
   static char     path[SGE_PATH_MAX];
   static bool     called = false;
   SGE_STRUCT_STAT statbuf;
   dstring         ds;
   char            buffer[SGE_PATH_MAX+128];
   char            tmppath[SGE_PATH_MAX];
   int             fd       = -1;
   FILE            *fp      = NULL;
   int             do_chown = 0;

  	/* 
  	 *  after changing into the jobs cwd we need an 
  	 *  absolute path to the error/trace file 
  	 */
	if (called == false) { 
      getcwd(path, sizeof(path)); 
		called=true;
	}

  	snprintf(tmppath, SGE_PATH_MAX,"%s/%s",path, g_shepherd_file_name[shepherd_file]);
   sge_strlcpy(g_shepherd_file_path[shepherd_file], tmppath, SGE_PATH_MAX);

	/* If the file does not exist, create it. Otherwise just open it. */
	if (SGE_STAT(tmppath, &statbuf)) {
	  fd = SGE_OPEN3(tmppath, O_RDWR | O_CREAT | O_APPEND, 0644);
      if (fd<0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "creat(%s) failed: %s", tmppath, strerror(errno));
         shepherd_panic(buffer);
      }

      if (getuid() == SGE_SUPERUSER_UID) {
         /* We must give the file to the job owner later */
			do_chown = 1;
		} else {
         /* We are not root, so we have to own all files anyway. */
         do_chown = 0;
		}
	} else {
      /* The file already exists. We get here when
       * a) a exec() failed or
       * b) after the execution of prolog/job, when the job/epilog
       * tries to init the error/exit status files.
       *
       * In a root system we can just open the file, because we are either
       * root or the job user who owns the file.
       * In a admin user system we must set our euid to root to open it, then
       * it is the same as the root system.
       * In a test user system we are the owner of the file and can open it.
       *
       * When we are root (masked or not), we gave this file to the
       * prolog user/job user right after its creation. But we can have
       * 3 different users for prolog, job and epilog, so we must give
       * the file here to the next user.
       * This must be done via shepherd_trace_chown() in the shepherd
       * before we switch to this user there.
       * It can't be done here because we don't know if we are in 
       * case a) (exec failed) or case b) (after execution of prolog/job).
       */
      int  old_euid = SGE_SUPERUSER_UID;

      /*
       * Work around for CR 6293411:
       * See shepherd_trace_exit() for details.
       */
      if (getuid() == SGE_SUPERUSER_UID) {
         old_euid = geteuid();
         seteuid(SGE_SUPERUSER_UID);
      }

      fd = SGE_OPEN2(tmppath, O_RDWR | O_APPEND);
      if (fd<0) {
         sge_dstring_init(&ds, buffer, sizeof(buffer));
         sge_dstring_sprintf(&ds, "open(%s) failed: %s",
                             tmppath, strerror(errno));
         shepherd_panic(buffer);
      }
      do_chown = 0;

      /*
       * Switch back to admin user?
       */
      if (old_euid != SGE_SUPERUSER_UID) {
         seteuid(old_euid);
      }
	}

	/* Something went wrong. */
	if (fd<0) {
		return NULL;
	}

	/* To avoid to block stdin, stdout or stderr, dup the fd until it is >= 3 */
	if (fd<3) {
		dup_fd(&fd);
	}

	/* Set FD_CLOEXEC flag to automatically close the file in an exec() */
	if (!set_cloexec(fd)) {
      shepherd_panic("set_cloexec() failed");
		return NULL;
	}

   /*
	 * Now open a FILE* from the file descriptor, so we can use fprintf().
    */
	fp = fdopen(fd, "a");
   if (!fp) {
      sge_dstring_init(&ds, buffer, sizeof(buffer));
      sge_dstring_sprintf(&ds, "can't open %s file \"%s\": %s\n",
				              g_shepherd_file_name[shepherd_file], tmppath, 
                          strerror(errno));
      shepherd_panic(buffer);
      return NULL;
   }
	if (do_chown && strlen(g_job_owner) > 0) {
		shepherd_trace_chown_intern(g_job_owner, fp, shepherd_file);
	}
	return fp;
}
Esempio n. 8
0
/*----------------------------------------------------------------------------*/
int 
main(int argc, char **argv)
{
   int heartbeat        = 0;
   int last_heartbeat   = 0;
   int latest_heartbeat = 0;
   int ret              = 0;
   int delay            = 0;
   time_t now, last;
/*    const char *cp; */
   char err_str[MAX_STRING_SIZE];
   char shadowd_pidfile[SGE_PATH_MAX];
   dstring ds;
   char buffer[256];
   pid_t shadowd_pid;

#if 1

static int check_interval = CHECK_INTERVAL;
static int get_active_interval = GET_ACTIVE_INTERVAL;
static int delay_time = DELAY_TIME;
static int sge_test_heartbeat = 0;

char binpath[SGE_PATH_MAX];
char oldqmaster[SGE_PATH_MAX];

char shadow_err_file[SGE_PATH_MAX];
char qmaster_out_file[SGE_PATH_MAX];

#endif

   lList *alp = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;

   DENTER_MAIN(TOP_LAYER, "sge_shadowd");
   
   sge_dstring_init(&ds, buffer, sizeof(buffer));
   /* initialize recovery control variables */
   {
      char *s;
      int val;
      if ((s=getenv("SGE_CHECK_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         check_interval = val;
      if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         get_active_interval = val;
      if ((s=getenv("SGE_DELAY_TIME")) &&
          sscanf(s, "%d", &val) == 1)
         delay_time = val;
      if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) &&
          sscanf(s, "%d", &val) == 1)
         sge_test_heartbeat = val;
   }
         
   /* This needs a better solution */
   umask(022);

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   log_state_set_log_file(TMP_ERR_FILE_SHADOWD);

   if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }

   /* AA: TODO: change this */
   ctx->set_exit_func(ctx, shadowd_exit_func);
   sge_setup_sig_handlers(SHADOWD);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   if (ctx->get_qmaster_spool_dir(ctx) != NULL) {
      char *shadowd_name = SGE_SHADOWD;

      /* is there a running shadowd on this host (with unqualified name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_unqualified_hostname(ctx));

      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }

      ctx->prepare_enroll(ctx);

      /* is there a running shadowd on this host (with aliased name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_qualified_hostname(ctx));
      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }  
   } else {
      ctx->prepare_enroll(ctx);
   }

   if (parse_cmdline_shadowd(argc, argv) == 1) {
      SGE_EXIT((void**)&ctx, 0);
   }
   
   if (ctx->get_qmaster_spool_dir(ctx) == NULL) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (chdir(ctx->get_qmaster_spool_dir(ctx))) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) {
      CRITICAL((SGE_EVENT, SFNMAX, err_str));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_switch2admin_user()) {
      CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER));
      SGE_EXIT((void**)&ctx, 1);
   }

   sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx));
   sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx));
   sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND);
   unlink(TMP_ERR_FILE_SHADOWD);
   log_state_set_log_as_admin_user(1);
   log_state_set_log_file(shadow_err_file);

   {
      int* tmp_fd_array = NULL;
      unsigned long tmp_fd_count = 0;

      if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) {
         sge_daemonize(tmp_fd_array, tmp_fd_count, ctx);
         if (tmp_fd_array != NULL) {
            sge_free(&tmp_fd_array);
         }
      } else {
         sge_daemonize(NULL, 0, ctx);
      }
   }

   /* shadowd pid file will contain aliased name */
   sge_write_pid(shadowd_pidfile);

   starting_up();
   
   sge_setup_sig_handlers(SHADOWD);

   last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

   last = (time_t) sge_get_gmt(); /* set time of last check time */

   delay = 0;
   while (!shut_me_down) {
      sleep(check_interval);

      /* get current heartbeat file content */
      heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

      now = (time_t) sge_get_gmt();


      /* Only check when we could read the heartbeat file at least two times
       * (last_heartbeat and heartbeat) without error 
       */
      if (last_heartbeat > 0 && heartbeat > 0) {

         /*
          * OK we have to heartbeat entries to check. Check times ...
          * now  = current time
          * last = last check time
          */
         if ( (now - last) >= (get_active_interval + delay) ) {

            delay = 0;
            if (last_heartbeat == heartbeat) {
               DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last)));
               delay = delay_time; /* set delay time */

               /*
                * check if we are a possible new qmaster host (lock file of qmaster active, etc.)
                */
               ret = check_if_valid_shadow(binpath, oldqmaster, 
                                           ctx->get_act_qmaster_file(ctx), 
                                           ctx->get_shadow_master_file(ctx), 
                                           ctx->get_qualified_hostname(ctx), 
                                           ctx->get_binary_path(ctx));

               if (ret == 0) {
                  /* we can start a qmaster on this host */
                  if (qmaster_lock(QMASTER_LOCK_FILE)) {
                     ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER));
                  } else {
                     int out, err;

                     /* still the old qmaster name in act_qmaster file and still the old heartbeat */
                     latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30);
                     /* TODO: what do we when there is a timeout ??? */
                     DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n"));
                     if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) &&
                         !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && 
                         (latest_heartbeat == heartbeat)) {
                        char qmaster_name[256];

                        strcpy(qmaster_name, SGE_PREFIX);
                        strcat(qmaster_name, prognames[QMASTER]); 
                        DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); 

                        /*
                         * open logfile as admin user for initial qmaster/schedd 
                         * startup messages
                         */
                        out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 
                                   0644);
                        err = out;
                        if (out == -1) {
                           /*
                            * First priority is the master restart
                            * => ignore this error
                            */
                           out = 1;
                           err = 2;
                        } 

                        sge_switch2start_user();
                        ret = startprog(out, err, NULL, binpath, qmaster_name, NULL);
                        sge_switch2admin_user();
                        if (ret) {
                           ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER));
                        }
                        close(out);
                     } else {
                        qmaster_unlock(QMASTER_LOCK_FILE);
                     }
                  }      
               } else {
                  if (ret == -1) {
                     /* just log the more important failures */    
                     WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) ));
                  }
               } 
            }
            /* Begin a new interval, set timers and hearbeat to current values */
            last = now;
            last_heartbeat = heartbeat;
         }
      } else {
         if (last_heartbeat < 0 || heartbeat < 0) {
            /* There was an error reading heartbeat or last_heartbeat */
            DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n",
                     sge_u32c(last_heartbeat), sge_u32c(heartbeat)));
         } else {
            DPRINTF(("have to read the heartbeat file twice to check time differences\n"));
         }
      }
   }

   sge_shutdown((void**)&ctx, 0);

   DRETURN(EXIT_SUCCESS);
}