Пример #1
0
void sge_pset_create_processor_set(void) 
{
#if defined(__sgi) || defined(ALPHA) || defined(SOLARIS64) || defined(SOLARISAMD64)
   char err_str[2*SGE_PATH_MAX+128];

   /* SGI IRIX processor set stuff */
   if (strcasecmp("UNDEFINED",get_conf_val("processors"))) {
      int ret;

      sge_switch2start_user();
      if ((ret=set_processor_range(get_conf_val("processors"),
                 (int) strtol(get_conf_val("job_id"), NULL, 10),
                 err_str)) != PROC_SET_OK) {
         sge_switch2admin_user();
         if (ret == PROC_SET_WARNING) /* not critical - e.g. not root */
            shepherd_trace("warning: processor set not set in set_processor_range");
         else { /* critical --> use err_str to indicate error */
            shepherd_trace("critical error in set_processor_range - bailing out");
            shepherd_state = SSTATE_PROCSET_NOTSET;
            shepherd_error(1, err_str);
         }
      } else {
         sge_switch2admin_user();
      }
   }
#endif

}
Пример #2
0
/* Put the process PID into controller directory DIR.  */
static bool
set_pid_cgroup(pid_t pid, char *dir)
{
   char path[SGE_PATH_MAX], spid[PID_BSIZE];
   int ret;
   bool is_admin, error;

   DENTER(TOP_LAYER, "set_pid_cgroup");
   if (!pid) pid = getpid();
   snprintf(spid, sizeof spid, pid_t_fmt, pid);
   build_path(path, dir, "tasks");
   sge_running_as_admin_user(&error, &is_admin);
   if (error) {
      CRITICAL((SGE_EVENT, "Can't get admin user"));
      abort();
   }
   /* We can't move tasks unless we have permission to signal them,
      and we need sgeadmin permission for the filesystem.  */
   sge_seteuid(SGE_SUPERUSER_UID);
   errno = 0;
   ret = sge_string2file(spid, strlen(spid), path);
   if (ret != 0)
      WARNING((SGE_EVENT, "Can't put task in controller "SFN": "SFN,
               dir, strerror(errno)));
   if (is_admin)
      ret = sge_switch2admin_user();
   else
      ret = sge_switch2start_user();
   if (ret != 0) {
      CRITICAL((SGE_EVENT, "Can't switch user"));
      DEXIT;
      abort();
   }
   DRETURN(ret ? false : true);
}
Пример #3
0
int sge_remove_tmpdir(const char *dir, const char *job_owner, u_long32 jobid, u_long32 jataskid, const char *queue_name)
{
   stringT tmpstr;
   char err_str_buffer[1024];
   dstring err_str;

   DENTER(TOP_LAYER, "sge_remove_tmpdir");

   sge_dstring_init(&err_str, err_str_buffer, sizeof(err_str_buffer));

   if (!dir) {
      DRETURN(0);
   }

   sprintf(tmpstr, "%s/"sge_u32"."sge_u32".%s", dir, jobid, jataskid, queue_name);
   DPRINTF(("recursively unlinking \"%s\"\n", tmpstr));
   sge_switch2start_user();
   if (sge_rmdir(tmpstr, &err_str)) {
      ERROR((SGE_EVENT, MSG_FILE_RECURSIVERMDIR_SS, 
             tmpstr, err_str_buffer));
      sge_switch2admin_user();
      DRETURN(-1);
   }
   sge_switch2admin_user();

   DRETURN(0);
}
Пример #4
0
void sge_pset_free_processor_set(void)
{
#if defined(__sgi) || defined(ALPHA) || defined(SOLARIS64) || defined(SOLARISAMD64)
   /* SGI IRIX processor set stuff */
   if (strcasecmp("UNDEFINED",get_conf_val("processors"))) {
      char err_str[2*SGE_PATH_MAX+128];
      int ret;

      errno = 0;
      if (sge_switch2start_user()) {
         shepherd_trace("failed to switch user in free_processor_set: %s",
                        strerror(errno));
         shepherd_state = SSTATE_PROCSET_NOTFREED;
         shepherd_error(1, strerror(errno));
         return;
      }
      if ((ret=free_processor_set(err_str)) != PROC_SET_OK) {
         sge_switch2admin_user();
         switch (ret) {
         case PROC_SET_WARNING: /* not critical - e.g. not root */
            shepherd_trace("warning: processor set not freed in free_processor_set - "
                           "did no exist, probably");
            break;
         case PROC_SET_ERROR: /* critical - err_str indicates error */
            shepherd_trace("critical error in free_processor_set - bailing out");
            shepherd_state = SSTATE_PROCSET_NOTFREED;
            shepherd_error(1, err_str);
            break;
         case PROC_SET_BUSY: /* still processes running in processor set */
            shepherd_trace("error in releasing processor set");
            shepherd_state = SSTATE_PROCSET_NOTFREED;
            shepherd_error(1, err_str);
            break;
         default: /* should not occur */
            sprintf(err_str,
               "internal error after free_processor_set - ret=%d", ret);
            shepherd_state = SSTATE_PROCSET_NOTFREED;
            shepherd_error(1, err_str);
            break;
         }
      } else {
         sge_switch2admin_user();
      }
   }
#endif
}
Пример #5
0
char *sge_make_tmpdir(lListElem *qep, u_long32 jobid, u_long32 jataskid, uid_t uid, gid_t gid, char *tmpdir)
{
   const char *t;

   DENTER(TOP_LAYER, "sge_make_tmpdir");

   t = lGetString(qep, QU_tmpdir);
   if (t == NULL) {
      DRETURN(NULL);
   }

   /* Note could have multiple instantiations of same job, */
   /* on same machine, under same queue */
   sprintf(tmpdir, "%s/"sge_u32"."sge_u32".%s", t, jobid, jataskid, lGetString(qep, QU_qname));

   DPRINTF(("making TMPDIR=%s\n", tmpdir));

   sge_switch2start_user();
   sge_mkdir(tmpdir, 0755, false, false);

   /*
    * chown is considered to be a security flaw, as an attacker might move the 
    * directory between the mkdir and chown.
    * This is both nearly impossible here and would have no effect.
    * Make flawfinder ignore it
    */
   /* Flawfinder: ignore */
   if (chown(tmpdir, uid, gid) != 0) {
      dstring ds = DSTRING_INIT;
      ERROR((SGE_EVENT, MSG_FILE_NOCHOWN_SS, tmpdir, sge_strerror(errno, &ds)));
      sge_dstring_free(&ds);
      unlink(tmpdir);
      DRETURN(NULL);
   }

   sge_switch2admin_user();

   DRETURN(tmpdir);
}
Пример #6
0
/****** uti/stdio/sge_peopen() ************************************************
*  NAME
*     sge_peopen_r() -- Advanced popen()
*
*  SYNOPSIS
*     pid_t sge_peopen_r(const char *shell, int login_shell,
*                        const char *command, const char *user,
*                        char **env, FILE **fp_in, FILE **fp_out,
*                        FILE **fp_err)
*
*  FUNCTION
*     Advanced popen() with additional parameters:
*        - free shell usage
*        - login shell if wanted
*        - user under which to start (for root only)
*        - stdin and stderr file pointers
*        - wait for exactly the process we started
*     File descriptors have to be closed with sge_peclose().
*
*     This function is reentrant as long as env is not provided to
*     this function. This means that the function can be used in 
*     multi thread processed as long as env is not used. 
*
*  INPUTS
*     const char *shell   - which shell to use
*     int login_shell     - make it a login shell?
*     const char *command - name of the program
*     const char *user    - user under which to start (for root only)
*     char **env          - env variables to add to child
*     FILE **fp_in        - file input stream
*     FILE **fp_out       - file output stream
*     FILE **fp_err       - file error stream
*
*  RESULT
*     pid_t - process id
*
*  NOTES
*     MT-NOTE: sge_peopen() is MT safe 
*
*     DO NOT ADD ASYNC SIGNAL UNSAFE FUNCTIONS BETWEEN FORK AND EXEC
*     DUE TO THE FACT THAT THIS FUNCTION WILL BE USED IN QMASTER
*     (MULTITHREADED ENVIRONMENT) THIS MIGHT CAUSE A DEADLOCK 
*     IN A MASTER THREAD. 
*
*  SEE ALSO
*     uti/stdio/sge_peclose()
******************************************************************************/ 
pid_t sge_peopen_r(const char *shell, int login_shell, const char *command,
                   const char *user, char **env,  FILE **fp_in, FILE **fp_out,
                   FILE **fp_err, bool null_stderr)
{
   pid_t pid;
   int pipefds[3][2];
   int i;
   char arg0[256];
#if defined(SOLARIS)
   char err_str[256];
#endif
   struct passwd *pw = NULL;
   uid_t myuid;
   uid_t tuid;
 
   DENTER(TOP_LAYER, "sge_peopen_r");

   if (sge_has_admin_user()) {
      sge_switch2start_user();
   }
   myuid = geteuid();
   tuid = myuid;

   /* 
    * open pipes - close on failure 
    */
   for (i = 0; i < 3; i++) {
      if (pipe(pipefds[i]) != 0) {
         while (--i >= 0) {
            close(pipefds[i][0]);
            close(pipefds[i][1]);
         }
         ERROR((SGE_EVENT, MSG_SYSTEM_FAILOPENPIPES_SS, command, strerror(errno)));
         if (sge_has_admin_user()) {
            sge_switch2admin_user();
         }
         DRETURN(-1);
      }
   }

   /*
    * set arg0 for exec call correctly to that
    * either a normal shell or a login shell will be started
    */
   if (login_shell) {
      strcpy(arg0, "-");
   } else {
      strcpy(arg0, "");
   }
   strcat(arg0, shell);
   DPRINTF(("arg0 = %s\n", arg0));
   DPRINTF(("arg1 = -c\n"));
   DPRINTF(("arg2 = %s\n", command));


   /*
    * prepare the change of the user which might be done after fork()
    * if a user name is provided.
    *
    * this has to be done before the fork() afterwards it might cause
    * a deadlock of the child because getpwnam() is not async-thread safe.
    */
   if (user) {
      struct passwd pw_struct;
      int size = get_pw_buffer_size();
      char *buffer = sge_malloc(size);

      /*
       * get information about the target user
       */
      if (buffer != NULL) {
         pw = sge_getpwnam_r(user, &pw_struct, buffer, size);
         if (pw == NULL) {
            ERROR((SGE_EVENT, MSG_SYSTEM_NOUSERFOUND_SS, user, strerror(errno)));
            FREE(buffer);
            if (sge_has_admin_user()) {
               sge_switch2admin_user();
            }
            DRETURN(-1);
         }
      } else {
         ERROR((SGE_EVENT, MSG_UTI_MEMPWNAM));
         FREE(buffer);
         if (sge_has_admin_user()) {
            sge_switch2admin_user();
         }
         DRETURN(-1);
      }

      DPRINTF(("was able to resolve user\n"));

      /* 
       * only prepare change of user if target user is different from current one
       */
      if (myuid != pw->pw_uid) {
#if !(defined(WIN32) || defined(INTERIX)) /* var not needed */
         int res;
#endif 

         if (myuid != SGE_SUPERUSER_UID) {
            DPRINTF(("only root is allowed to switch to a different user\n"));
            ERROR((SGE_EVENT, MSG_SYSTEM_NOROOTRIGHTSTOSWITCHUSER));
            FREE(buffer);
            DRETURN(-2);
         }                             

         DPRINTF(("Before initgroups\n"));

#if !(defined(WIN32) || defined(INTERIX))  /* initgroups not called */
         res = initgroups(pw->pw_name, pw->pw_gid);
#  if defined(SVR3) || defined(sun)
         if (res < 0)
#  else
         if (res)
#  endif
         {
            ERROR((SGE_EVENT, MSG_SYSTEM_INITGROUPSFORUSERFAILED_ISS, res, user, strerror(errno)));
            FREE(buffer);
            SGE_EXIT(NULL, 1);
         }

         DPRINTF(("Initgroups was successful\n"));

#endif /* WIN32 */
      }
      DPRINTF(("user = %s\n", user));
      DPRINTF(("myuid = %d\n", (int)myuid));
      if (pw != NULL) {
         tuid = pw->pw_uid;
         DPRINTF(("target uid = %d\n", (int)tuid));
      }

      FREE(buffer);
   }

   DPRINTF(("Now process will fork\n"));

#if defined(SOLARIS)
   pid = sge_smf_contract_fork(err_str, 256);
#else
   pid = fork();
#endif

   /* 
    * in the child pid is 0 
    */
   if (pid == 0) {  
      /*
       * close all fd's except that ones mentioned in keep_open 
       */
      int keep_open[6];
      keep_open[0] = 0;
      keep_open[1] = 1;
      keep_open[2] = 2;
      keep_open[3] = pipefds[0][0];
      keep_open[4] = pipefds[1][1];
      keep_open[5] = pipefds[2][1];
      sge_close_all_fds(keep_open, 6);

      /* 
       * shall we redirect stderr to /dev/null? Then
       *    - open "/dev/null"
       *    - set stderr to "dev/null"
       *    - close the stderr-pipe
       * otherwise
       *    - redirect stderr to the pipe
       */
      if (null_stderr) {
         int fd = open("/dev/null", O_WRONLY);

         if (fd != -1) {
            close(2);
            dup(fd);
            close(pipefds[2][1]);
         } else {
            SGE_EXIT(NULL, 1);
         }
      } else {
         close(2);
         dup(pipefds[2][1]);
      }

      /* 
       * redirect stdin and stdout to the pipes 
       */
      close(0);
      close(1);
      dup(pipefds[0][0]);
      dup(pipefds[1][1]);

      if (pw != NULL) {
         int lret = setuid(tuid);
         if (lret) {
            SGE_EXIT(NULL, 1);
         }
      }

      /*
       * set the environment if we got one as argument
       */
      if (env != NULL) {
         if (pw != NULL) {
            addenv("HOME", pw->pw_dir);
            addenv("SHELL", pw->pw_shell);
            addenv("USER", pw->pw_name);
            addenv("LOGNAME", pw->pw_name);
         }
         addenv("PATH", SGE_DEFAULT_PATH);
         for(; *env; env++) {
            putenv(*env);
         }
      }
      execlp(shell, arg0, "-c", command, NULL);
   }
 
   if (pid < 0) {
      for (i=0; i<3; i++) {
         close(pipefds[i][0]);
         close(pipefds[i][1]);
      }
#if defined(SOLARIS)
      if (pid < -1 && err_str) {
          ERROR((SGE_EVENT, MSG_SMF_FORK_FAILED_SS, "sge_peopen()", err_str));
      }
#endif
      if (sge_has_admin_user()) {
         sge_switch2admin_user();
      }
      DRETURN(-1);
   }
 
   /* close the childs ends of the pipes */
   close(pipefds[0][0]);
   close(pipefds[1][1]);
   close(pipefds[2][1]);
  
   /* return filehandles for stdin and stdout */
   *fp_in  = fdopen(pipefds[0][1], "a");
   *fp_out = fdopen(pipefds[1][0], "r");

   /* is stderr redirected to /dev/null? */
   if (null_stderr) {
      /* close the pipe and return NULL as filehandle */
      close(pipefds[2][0]);
      *fp_err = NULL;
   } else {
      /* return filehandle for stderr */
      *fp_err = fdopen(pipefds[2][0], "r");
   }

   if (sge_has_admin_user()) {
      sge_switch2admin_user();
   }
   DRETURN(pid);
}
Пример #7
0
/* Remove the task's controller directory after moving out the
   shepherd and killing anything left.  */
bool
remove_shepherd_cpuset(u_long32 job, u_long32 task, pid_t pid)
{
   char dir[SGE_PATH_MAX], taskfile[SGE_PATH_MAX], spid[PID_BSIZE];
   FILE *fp;
   bool rogue = false;

   DENTER(TOP_LAYER, "remove_shepherd_cpuset");
   snprintf(dir, sizeof dir, "%s/"sge_u32"."sge_u32"/"pid_t_fmt,
            cgroup_dir(cg_cpuset), job, task, pid);
   build_path(taskfile, dir, "tasks");
   /* We should have an empty task list.  If we can't remove it, kill
      anything there.  Arguably this should be repeated in case of a
      race against things spawning if we don't have the freezer cgroup.  */
   errno = 0;
   if (rmdir(dir) == 0) DRETURN(true);
   /* EBUSY means it still has tasks.  */
   if (errno != EBUSY) {
      ERROR((SGE_EVENT, MSG_FILE_RMDIRFAILED_SS, dir, strerror(errno)));
      DRETURN(false);
   }
   if (!(fp = fopen(taskfile, "r"))) {
      WARNING((SGE_EVENT, MSG_FILE_NOOPEN_SS, taskfile, strerror(errno)));
      DRETURN(false);
   }
   while (fgets(spid, sizeof spid, fp)) {
      char buf[MAX_STRING_SIZE], file[SGE_PATH_MAX], *v, *cmd;
      pid_t tgid;
      size_t l;

      replace_char(spid, strlen(spid), '\n', '\0');

      /* Move the task away to avoid waiting for it to die.  */
      /* Fixme:  Keep the cpusetdir tasks open and just write to that.  */
      reparent_proc(spid, cgroup_dir(cg_cpuset));

      /* Kill rogue processes, avoiding the shepherd.  (Shepherd needs
         to be killed exactly once, otherwise sge_reap_children_execd
         is called multiple times.)  Only consider entries in the task
         list that are processes (Tgid == Pid), not threads; this
         copes with old-style cpusets, lacking cgroup.procs.  */
      snprintf(file, sizeof file, "/proc/%s/status", spid);
      v = file_getvalue(buf, MAX_STRING_SIZE, file, "Tgid:");
      if (! v) continue;
      tgid = atoi(v);
      if (strcmp(v, spid)       /* process */
          && (tgid != pid)) {   /* not shepherd */
          if (!rogue)
             WARNING((SGE_EVENT, "rogue process(es) found for task "
    		  sge_u32"."sge_u32, job, task));
          rogue = true;

          /* Extract and log process name */
          snprintf(file, sizeof file, "/proc/%s/cmdline", spid);
          errno = 0;
          l = sizeof buf;
          cmd = dev_file2string(file, buf, &l);
          if (l)
             INFO((SGE_EVENT, "rogue: "SFN2, replace_char(cmd, l, '\0', ' ')));

          sge_switch2start_user();
          kill(tgid, SIGKILL);
          sge_switch2admin_user();
      }
   }
   fclose(fp);
   errno = 0;
   if (rmdir(dir) == 0) DRETURN(true);
   ERROR((SGE_EVENT, MSG_FILE_RMDIRFAILED_SS, dir, strerror(errno)));
   DRETURN(false);
}
Пример #8
0
/*-------------------------------------------------------------------*/
void sge_setup_sge_execd(sge_gdi_ctx_class_t *ctx, const char* tmp_err_file_name)
{
   char err_str[MAX_STRING_SIZE];
   int allowed_get_conf_errors     = 5;
   char* spool_dir = NULL;
   const char *unqualified_hostname = ctx->get_unqualified_hostname(ctx);
   const char *admin_user = ctx->get_admin_user(ctx);

   DENTER(TOP_LAYER, "sge_setup_sge_execd");

   /* TODO: is this the right place to switch the user ?
            ports below 1024 ok */
   /*
   ** switch to admin user
   */
   if (sge_set_admin_username(admin_user, err_str)) {
      CRITICAL((SGE_EVENT, SFNMAX, err_str));
      /* TODO: remove */
      SGE_EXIT(NULL, 1);
   }

   if (sge_switch2admin_user()) {
      CRITICAL((SGE_EVENT, SFNMAX, MSG_ERROR_CANTSWITCHTOADMINUSER));
      /* TODO: remove */
      SGE_EXIT(NULL, 1);
   }

   while (gdi2_wait_for_conf(ctx, &Execd_Config_List)) {
      if (allowed_get_conf_errors-- <= 0) {
         CRITICAL((SGE_EVENT, SFNMAX, MSG_EXECD_CANT_GET_CONFIGURATION_EXIT));
         /* TODO: remove */
         SGE_EXIT(NULL, 1);
      }
      sleep(1);
      ctx->get_master(ctx, true);
   }
   sge_show_conf();         


   /* get aliased hostname */
   /* TODO: is this call needed ? */
   ctx->reresolve_qualified_hostname(ctx);
   spool_dir = mconf_get_execd_spool_dir();

   DPRINTF(("chdir(\"/\")----------------------------\n"));
   sge_chdir_exit("/",1);
   DPRINTF(("Making directories----------------------------\n"));
   sge_mkdir(spool_dir, 0755, 1, 0);
   DPRINTF(("chdir(\"%s\")----------------------------\n", spool_dir));
   sge_chdir_exit(spool_dir,1);
   sge_mkdir(unqualified_hostname, 0755, 1, 0);
   DPRINTF(("chdir(\"%s\",me.unqualified_hostname)--------------------------\n",
            unqualified_hostname));
   sge_chdir_exit(unqualified_hostname, 1); 
   /* having passed the  previous statement we may 
      log messages into the ERR_FILE  */
   if ( tmp_err_file_name != NULL) {
      sge_copy_append((char*)tmp_err_file_name, ERR_FILE, SGE_MODE_APPEND);
   }
   sge_switch2start_user();
   if ( tmp_err_file_name != NULL) {
      unlink(tmp_err_file_name);
   }
   sge_switch2admin_user();
   log_state_set_log_as_admin_user(1);
   sprintf(execd_messages_file, "%s/%s/%s", spool_dir, 
           unqualified_hostname, ERR_FILE);
   log_state_set_log_file(execd_messages_file);

   sprintf(execd_spool_dir, "%s/%s", spool_dir, 
           unqualified_hostname);
   
   DPRINTF(("Making directories----------------------------\n"));
   sge_mkdir(EXEC_DIR, 0775, 1, 0);
   sge_mkdir(JOB_DIR, 0775, 1, 0);
   sge_mkdir(ACTIVE_DIR,  0775, 1, 0);

#if defined(PLPA_LINUX) || defined(SOLARIS86) || defined(SOLARISAMD64)
   /* initialize processor topology */
   if (initialize_topology() != true) {
      DPRINTF(("Couldn't initizalize topology-----------------------\n"));
   }
#endif

   sge_free(&spool_dir);
   DRETURN_VOID;
}
Пример #9
0
/****** shepherd_binding/do_core_binding() ******************
*******************
*  NAME
*     do_core_binding() -- Performs the core binding task for the Solaris OS. 
*
*  SYNOPSIS
*     int do_core_binding(void) 
*
*  FUNCTION
*     Performs core binding on shepherd side. All information required for  
*     the binding is communicated from execd to shepherd in the config 
*     file value "binding". If there is "NULL" no core binding is done. 
*
*     This function is Solaris specific.
*
*     DG TODO change return value to bool
*
*  RESULT
*     int - Returns 0 in case of success and a negative value in case of problems.  
*
*  NOTES
*     MT-NOTE: do_core_binding() is not MT safe 
*
*******************************************************************************/
int do_core_binding(void)
{
   int retval = 0; 

   /* just read out what is in "config" file and attach to the given psrset if 
      it is specified */
   char *binding = get_conf_val("binding");
   
   if (binding == NULL) {
      shepherd_trace("do_core_binding: \"binding\" parameter not found in config file");
      retval = -1;
   } else if (strcasecmp("no_job_binding", binding) == 0 || strcasecmp("NULL", binding) == 0) {
      shepherd_trace("do_core_binding: skip binding - no core binding configured");
      retval = -1;
   }

   if (retval == 0 && strstr(binding, "psrset:") != NULL) {
      int processor_set_id = 0;
      shepherd_trace("do_core_binding: psrset found - attaching to it!");

      /* parse the psrset number right after "psrset:" */
      if (sge_strtok(binding, ":") != NULL) {
         /* parse the rest of the line */
         char* pset_id;
         if ((pset_id = sge_strtok(NULL, ":")) != NULL) {
            /* finally get the processor set id */
            processor_set_id = atoi(pset_id);
         } else {
            shepherd_trace("do_core_binding: couldn't find the psrset id after \"psrset:\" in config file (binding)");
            retval = -1;
         }
      } else {
         shepherd_trace("do_core_binding: found string \"psrset:\" but no \":\" - almost impossible");
         retval = -1;
      }

      if (retval == 0) {
         if (processor_set_id == -1) {            
            /* prcoessor_set_id == -1: Check here for a special processor_set_id (negative; 0)
               which does show that no binding is needed since this processor set
               would require (exactly) all of the remaining cores. Creating 
               such a processor set is not possible because one processor must 
               left for the OS. But the job is implicitly bound to the processors 
               since it can not use any onther processor from the other processor 
               sets. */
            shepherd_trace("do_core_binding: psrset not created since all remaining processors would be used");
            shepherd_trace("do_core_binding: binding is done implicitly");
         } else {
            /* start user rights (root) are required for creating processor sets */
            sge_switch2start_user();
            
            if (bind_shepherd_to_pset(processor_set_id) == false) {
               shepherd_trace("do_core_binding: couldn't bind to existing processor set!");
            } else {
               shepherd_trace("do_core_binding: successfully bound to existing processor set!");
            }
   
            /* switch back to admin user */
            sge_switch2admin_user();
         }
      }

   } else {  /* "psrset" is not in config file defined */
      shepherd_trace("do_core_binding: no processor set found in config file! do nothing");
      retval = -1;
   }

   shepherd_trace("do_core_binding: finishing");

   return retval;
}
Пример #10
0
lListElem *
spool_dynamic_create_context(lList **answer_list, const char *method,
                             const char *shlib_name, const char *args)
{
   bool ok = true;
   lListElem *context = NULL;

   /* shared lib name buffer and handle */
   dstring shlib_dstring = DSTRING_INIT;
   const char *shlib_fullname;
   void *shlib_handle;

   /* get_method function pointer and result */
   spooling_get_method_func get_spooling_method = NULL;
   const char *spooling_name = NULL;

   DENTER(TOP_LAYER, "spool_dynamic_create_context");

   /* build the full name of the shared lib - append architecture dependent
    * shlib postfix 
    */
   shlib_fullname = sge_dstring_sprintf(&shlib_dstring, "%s.%s", shlib_name, 
#if defined(HP11) || defined(HP1164)
                                        "sl"
#elif defined(DARWIN)
                                        "dylib"
#else
                                        "so"
#endif
                                       );

#if defined(HP1164)   
   /*
   ** need to switch to start user for HP
   */
   sge_switch2start_user();
#endif   

   /* open the shared lib */
   # if defined(DARWIN)
   # ifdef RTLD_NODELETE
   shlib_handle = dlopen(shlib_fullname, RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE);
   # else
   shlib_handle = dlopen(shlib_fullname, RTLD_NOW | RTLD_GLOBAL );
   # endif /* RTLD_NODELETE */
   # else
   # ifdef RTLD_NODELETE
   shlib_handle = dlopen(shlib_fullname, RTLD_NOW | RTLD_NODELETE);
   # else
   shlib_handle = dlopen(shlib_fullname, RTLD_NOW);
   # endif /* RTLD_NODELETE */
   #endif
                        
#if defined(HP1164)   
   /*
   ** switch back to admin user for HP
   */
   sge_switch2admin_user();
#endif

   if (shlib_handle == NULL) {
      answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                              ANSWER_QUALITY_ERROR, 
                              MSG_SPOOL_ERROROPENINGSHAREDLIB_SS, 
                              shlib_fullname, dlerror());
      ok = false;
   } 

   /* retrieve function pointer of get_method function in shared lib */
   if (ok) {
      dstring get_spooling_method_func_name = DSTRING_INIT;

      sge_dstring_sprintf(&get_spooling_method_func_name,
                          "get_%s_spooling_method", method);

      get_spooling_method = (spooling_get_method_func)
                            dlsym(shlib_handle, 
                            sge_dstring_get_string(&get_spooling_method_func_name));
      if (get_spooling_method == NULL) {
         answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                                 ANSWER_QUALITY_ERROR, 
                                 MSG_SPOOL_SHLIBDOESNOTCONTAINSPOOLING_SS, 
                                 shlib_fullname, dlerror());
         ok = false;
      }
      sge_dstring_free(&get_spooling_method_func_name);
   }

   /* retrieve name of spooling method in shared lib */
   if (ok) {
      spooling_name = get_spooling_method();

      if (spooling_name == NULL) {
         answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                                 ANSWER_QUALITY_INFO, 
                                 MSG_SPOOL_SHLIBGETMETHODRETURNSNULL_S, 
                                 shlib_fullname);
         ok = false;
      } else {
         if (strcmp(spooling_name, method) != 0) {
            answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                                    ANSWER_QUALITY_INFO, 
                                    MSG_SPOOL_SHLIBCONTAINSXWENEEDY_SSS, 
                                    shlib_fullname, spooling_name, method);
            ok = false;
         }
      }
   }

   /* create spooling context from shared lib */
   if (ok) {
      dstring create_context_func_name = DSTRING_INIT;
      spooling_create_context_func create_context;

      answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                              ANSWER_QUALITY_INFO, 
                              MSG_SPOOL_LOADINGSPOOLINGMETHOD_SS, 
                              spooling_name, shlib_fullname);

      sge_dstring_sprintf(&create_context_func_name, 
                          "spool_%s_create_context", spooling_name);

      create_context = 
            (spooling_create_context_func) 
            dlsym(shlib_handle, 
                  sge_dstring_get_string(&create_context_func_name));
      if (create_context == NULL) {
         answer_list_add_sprintf(answer_list, STATUS_EUNKNOWN, 
                                 ANSWER_QUALITY_ERROR, 
                                 MSG_SPOOL_SHLIBDOESNOTCONTAINSPOOLING_SS,
                                 shlib_fullname, dlerror());
         ok = false;
      } else {
         context = create_context(answer_list, args);
      }
      sge_dstring_free(&create_context_func_name);
   }

   /* cleanup in case of initialization error */
   if (context == NULL) {
      if (shlib_handle != NULL) {
         dlclose(shlib_handle);
      }
   }

   sge_dstring_free(&shlib_dstring);

   DRETURN(context);
}
Пример #11
0
/****** shepherd_error ********************************************************
*  NAME
*     shepherd_error() -- Write a line to the error file and exit program.
*
*  SYNOPSIS
*     void shepherd_error(bool do_exit, const char *format, ...)
*
*  FUNCTION
*     Writes a line to the error file, preceding it with a
*     date, time, uid and pid stamp, and exits the program. stops execution.
*
*  INPUTS
*     do_exit: If true, this function calls exit(2).
*     format: The format string of the line to be written to the error file.
*     ...: The parameters to the format string. See printf(3c).
*
*  RESULT
*     void - none
*******************************************************************************/
void shepherd_error(int do_exit, const char *format, ...)
{
   dstring     ds;
   dstring     message = DSTRING_INIT;
   char        buffer[128];
   char        header_str[256];
   struct stat statbuf;

   if (format != NULL)
   {
      va_list     ap;

      va_start(ap, format);
      sge_dstring_vsprintf(&message, format, ap);
      va_end(ap);
   }

   shepherd_trace(sge_dstring_get_string(&message));

   /* File was closed (e.g. by an exec()) but fp was not set to NULL */
   if (shepherd_error_fp && fstat(fileno(shepherd_error_fp), &statbuf) == -1 && errno==EBADF)
   {
      shepherd_error_fp = NULL;
   }

   if (shepherd_error_fp == NULL)
   {
      shepherd_error_fp = shepherd_trace_init_intern(st_error);
   }

   if (shepherd_error_fp != NULL)
   {
      sge_dstring_init(&ds, buffer, sizeof(buffer));
      sprintf(header_str, "%s ["uid_t_fmt":"pid_t_fmt"]: ",
              sge_ctime(0, &ds), geteuid(), getpid());

      sh_str2file(header_str, sge_dstring_get_string(&message), shepherd_error_fp);
   }

   if (foreground)
   {
      fprintf(stderr, "%s%s\n", header_str, sge_dstring_get_string(&message));
   }

   /* File was closed (e.g. by an exec()) but fp was not set to NULL */
   if (shepherd_exit_status_fp && fstat(fileno(shepherd_exit_status_fp), &statbuf) == -1 && errno==EBADF )
   {
      shepherd_exit_status_fp = NULL;
   }

   if (shepherd_exit_status_fp == NULL)
   {
      shepherd_exit_status_fp = shepherd_trace_init_intern(st_exit_status);
   }

   if (shepherd_exit_status_fp != NULL)
   {
      sprintf(header_str, "%d", shepherd_state);
      sh_str2file(header_str, NULL, shepherd_exit_status_fp);
   }
	
   if (coshepherd_pid > 0)
   {
      sge_switch2start_user();
      kill(coshepherd_pid, SIGTERM);
      sge_switch2admin_user();
   }   
     
   if (g_new_interactive_job_support == false && 
      search_conf_val("qrsh_control_port") != NULL)
   {
      char buffer[1024];
      snprintf(buffer, sizeof(buffer), "1:%s", sge_dstring_get_string(&message));
      write_to_qrsh(buffer);  
   }
   sge_dstring_free(&message);

   if (do_exit)
   {
      /* close all trace files before exit */
      shepherd_trace_exit();
      exit(shepherd_state);
   }

   /* There are cases where we have to open and close the files 
    * for every write.
    */
   if (!g_keep_files_open)
   {
      shepherd_error_exit();
   }
}
Пример #12
0
void setosjobid(pid_t sid, gid_t *add_grp_id_ptr, struct passwd *pw)
{
   FILE *fp=NULL;

   shepherd_trace("setosjobid: uid = "pid_t_fmt", euid = "pid_t_fmt, getuid(), geteuid());

#  if defined(SOLARIS) || defined(ALPHA) || defined(LINUX) || defined(FREEBSD) || defined(DARWIN)
      /* Read SgeId from config-File and create Addgrpid-File */
      {  
         char *cp;
         if ((cp = search_conf_val("add_grp_id")))
            *add_grp_id_ptr = atol(cp);
         else
            *add_grp_id_ptr = 0;
      }
      if ((fp = fopen("addgrpid", "w")) == NULL) {
         shepherd_error(1, "can't open \"addgrpid\" file");   
      }
      fprintf(fp, gid_t_fmt"\n", *add_grp_id_ptr);
      FCLOSE(fp);   
# elif defined(HP1164) || defined(AIX)
    {
      if ((fp = fopen("addgrpid", "w")) == NULL) {
         shepherd_error(1, "can't open \"addgrpid\" file");
      }
      fprintf(fp, pid_t_fmt"\n", getpgrp());
      FCLOSE(fp);
    }
#  else
   {
      char osjobid[100];
      if ((fp = fopen("osjobid", "w")) == NULL) {
         shepherd_error(1, "can't open \"osjobid\" file");
      }

      if(sge_switch2start_user() == 0) {
#     if defined(IRIX)
      {
         /* The following block contains the operations necessary for
          * IRIX6.2 (and later) to set array session handles (ASHs) and
          * service provider info (SPI) records
          */
         struct acct_spi spi;
         int ret;
         char *cp;

         shepherd_trace("in irix code");
         /* get _local_ array session id */
         if ((ret=newarraysess())) {
            shepherd_error(1, "error: can't create ASH; errno=%d", ret);
         }

         /* retrieve array session id we just assigned to the process and
          * write it to the os-jobid file
          */
         sprintf(osjobid, "%lld", getash());
         shepherd_trace(osjobid); 
         /* set service provider information (spi) record */
         strncpy(spi.spi_company, "SGE", 8);
         strncpy(spi.spi_initiator, get_conf_val("spi_initiator"), 8);
         strncpy(spi.spi_origin, get_conf_val("queue"),16);
         strcpy(spi.spi_spi, "Job ");
         strncat(spi.spi_spi, get_conf_val("job_id"),11);
         if ((ret=setspinfo(&spi))) {
            shepherd_error(1, "error: can't set SPI; errno=%d", ret);
         }
         
         if ((cp = search_conf_val("acct_project"))) {
            prid_t proj; 
            if (strcasecmp(cp, "none") && ((proj = projid(cp)) >= 0)) {
               shepherd_trace("setting project \"%s\" to id %lld", cp, proj);
               if (setprid(proj) == -1)
                  shepherd_trace("failed setting project id");
            }
            else {   
               shepherd_trace("can't get id for project \"%s\"", cp);
            }
         } else {
            shepherd_trace("can't get configuration entry for projects");
         }
      }
#     elif defined(CRAY)
      {
         char *cp;
	      {
	         int jobid;

	         if ((jobid=setjob(pw->pw_uid, 0)) < 0) {
	            shepherd_error(1, "error: can't set job ID; errno = %d", errno);
	         }

	         if (sesscntl(jobid, S_ADDFL, S_BATCH) == -1) {
	            shepherd_error(1, "error: sesscntl(%d, S_ADDFL, S_BATCH) failed,"
		                        " errno = %d", sid, errno);
	         } 
	         sprintf(osjobid, "%d", jobid);
	      }

	      if ((cp = search_conf_val("acct_project"))) {
	         int proj; 
	         if (strcasecmp(cp, "none") && ((proj = nam2acid(cp)) >= 0)) {
	            shephed_trace("setting project \"%s\" to acid %d", cp, proj);
	            if (acctid(0, proj) == -1) {
		            shepherd_trace("failed setting project id (acctid)");
               }
	         } else {   
	            shepherd_trace("can't get id for project \"%s\"", cp);
	         }
	      } else {
	         shepherd_trace("can't get configuration entry for projects");
         }
      }
#     elif defined(NECSX4) || defined(NECSX5)
      {
         id_t jobid = 0;
		 	dispset2_t attr;	
			int value;

         /*
          * Create new Super-UX job
          */
         if (setjid() == -1) {
            shepherd_trace("ERROR: can't set jobid: %s[%d]", strerror(errno), errno);
         } else {
            jobid = getjid(0);
            shepherd_trace("Created job with id: "sge_u32, (u_long32) jobid);
         }  
         sprintf(osjobid, sge_u32, (u_long32) jobid); 

         /*
          * We will use limits for the whole job
          */
         set_rlimits_os_job_id(jobid);

         /*
          * The job will use the resources of the configured 
          * Resource Sharing Group (rsg)
          */ 
         {
            char *rsg_id_string;
            int rsg_id;
            char fsg_dev_string[256];

            rsg_id_string  = get_conf_val("processors");
            rsg_id = atoi(rsg_id_string);
            if (rsg_id) {
               int fd;

               sprintf(fsg_dev_string, "/dev/rsg/%d", rsg_id);
               fd = open(fsg_dev_string, O_RDONLY);
               if (fd <= 0) {
                  shepherd_trace("ERROR: can't switch to rsg%d because can't open"
                                 "device: %s[%d]", rsg_id, strerror(errno), errno);
               } else {
                  if (ioctl(fd, RSG_JUMP, NULL) == -1) {
                     close(fd);
                     shepherd_trace("ERROR: can't switch to rsg%d: %s[%d]", 
                                    rsg_id, strerror(errno), errno);
                     return;
                  } else {
                     close(fd);
                     shepherd_trace("switched to rsg%d", rsg_id);
                  }
               }
            } else {
               shepherd_trace("using default rsg", rsg_id);
            }
         } 

         /*
          * Set scheduling parameter for job
          */
         if (((attr.basepri = atoi(get_conf_val("nec_basepriority"))) != NEC_UNDEF_VALUE)
            && ((attr.modcpu = atoi(get_conf_val("nec_modcpu"))) != NEC_UNDEF_VALUE)
            && ((attr.tickcnt = atoi(get_conf_val("nec_tickcnt"))) != NEC_UNDEF_VALUE)
            && ((attr.dcyfctr = atoi(get_conf_val("nec_dcyfctr"))) != NEC_UNDEF_VALUE)
            && ((attr.dcyintvl = atoi(get_conf_val("nec_dcyintvl"))) != NEC_UNDEF_VALUE)
            && ((attr.tmslice = atoi(get_conf_val("nec_timeslice"))) != NEC_UNDEF_VALUE)
            && ((attr.mempri = atoi(get_conf_val("nec_memorypriority"))) != NEC_UNDEF_VALUE)
            && ((attr.szefctmrt = atoi(get_conf_val("nec_mrt_size_effct"))) != NEC_UNDEF_VALUE)
            && ((attr.priefctmrt = atoi(get_conf_val("nec_mrt_pri_effct"))) != NEC_UNDEF_VALUE)
            && ((attr.minmrt = atoi(get_conf_val("nec_mrt_minimum"))) != NEC_UNDEF_VALUE)
            && ((attr.agrange = atoi(get_conf_val("nec_aging_range"))) != NEC_UNDEF_VALUE)
            && ((attr.spinherit = atoi(get_conf_val("nec_slavepriority"))) != NEC_UNDEF_VALUE)
            && ((attr.concpu = atoi(get_conf_val("nec_cpu_count"))) != NEC_UNDEF_VALUE)) {
            if (dispcntl(SG_JID, getjid(0), DCNTL_SET2, &attr) == -1) {
               shepherd_trace("ERROR: can't set scheduling parameter: %s[%d]",
                              strerror(errno), errno);
            } else {
               shepherd_trace("control parameters for active process scheduling modified");
               print_scheduling_parameters(attr);
            }
         } else {
            shepherd_trace("we do not control active process scheduling");
         }
      }               
#     else
         /* write a default os-jobid to file */
         sprintf(osjobid, pid_t_fmt, sid);
#     endif
         sge_switch2admin_user();
      } 
      else /* not running as super user --> we want a default os-jobid */
         sprintf(osjobid, "0");
      
      if(fprintf(fp, "%s\n", osjobid) < 0)
         shepherd_trace("error writing osjobid file");
         
      FCLOSE(fp); /* Close os-jobid file */   
   }
#  endif
   return;
FCLOSE_ERROR:
   shepherd_error(1, "can't close file"); 
}
Пример #13
0
/*----------------------------------------------------------------------------*/
int 
main(int argc, char **argv)
{
   int heartbeat        = 0;
   int last_heartbeat   = 0;
   int latest_heartbeat = 0;
   int ret              = 0;
   int delay            = 0;
   time_t now, last;
/*    const char *cp; */
   char err_str[MAX_STRING_SIZE];
   char shadowd_pidfile[SGE_PATH_MAX];
   dstring ds;
   char buffer[256];
   pid_t shadowd_pid;

#if 1

static int check_interval = CHECK_INTERVAL;
static int get_active_interval = GET_ACTIVE_INTERVAL;
static int delay_time = DELAY_TIME;
static int sge_test_heartbeat = 0;

char binpath[SGE_PATH_MAX];
char oldqmaster[SGE_PATH_MAX];

char shadow_err_file[SGE_PATH_MAX];
char qmaster_out_file[SGE_PATH_MAX];

#endif

   lList *alp = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;

   DENTER_MAIN(TOP_LAYER, "sge_shadowd");
   
   sge_dstring_init(&ds, buffer, sizeof(buffer));
   /* initialize recovery control variables */
   {
      char *s;
      int val;
      if ((s=getenv("SGE_CHECK_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         check_interval = val;
      if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         get_active_interval = val;
      if ((s=getenv("SGE_DELAY_TIME")) &&
          sscanf(s, "%d", &val) == 1)
         delay_time = val;
      if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) &&
          sscanf(s, "%d", &val) == 1)
         sge_test_heartbeat = val;
   }
         
   /* This needs a better solution */
   umask(022);

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   log_state_set_log_file(TMP_ERR_FILE_SHADOWD);

   if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }

   /* AA: TODO: change this */
   ctx->set_exit_func(ctx, shadowd_exit_func);
   sge_setup_sig_handlers(SHADOWD);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   if (ctx->get_qmaster_spool_dir(ctx) != NULL) {
      char *shadowd_name = SGE_SHADOWD;

      /* is there a running shadowd on this host (with unqualified name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_unqualified_hostname(ctx));

      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }

      ctx->prepare_enroll(ctx);

      /* is there a running shadowd on this host (with aliased name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_qualified_hostname(ctx));
      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }  
   } else {
      ctx->prepare_enroll(ctx);
   }

   if (parse_cmdline_shadowd(argc, argv) == 1) {
      SGE_EXIT((void**)&ctx, 0);
   }
   
   if (ctx->get_qmaster_spool_dir(ctx) == NULL) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (chdir(ctx->get_qmaster_spool_dir(ctx))) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) {
      CRITICAL((SGE_EVENT, SFNMAX, err_str));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_switch2admin_user()) {
      CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER));
      SGE_EXIT((void**)&ctx, 1);
   }

   sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx));
   sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx));
   sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND);
   unlink(TMP_ERR_FILE_SHADOWD);
   log_state_set_log_as_admin_user(1);
   log_state_set_log_file(shadow_err_file);

   {
      int* tmp_fd_array = NULL;
      unsigned long tmp_fd_count = 0;

      if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) {
         sge_daemonize(tmp_fd_array, tmp_fd_count, ctx);
         if (tmp_fd_array != NULL) {
            sge_free(&tmp_fd_array);
         }
      } else {
         sge_daemonize(NULL, 0, ctx);
      }
   }

   /* shadowd pid file will contain aliased name */
   sge_write_pid(shadowd_pidfile);

   starting_up();
   
   sge_setup_sig_handlers(SHADOWD);

   last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

   last = (time_t) sge_get_gmt(); /* set time of last check time */

   delay = 0;
   while (!shut_me_down) {
      sleep(check_interval);

      /* get current heartbeat file content */
      heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

      now = (time_t) sge_get_gmt();


      /* Only check when we could read the heartbeat file at least two times
       * (last_heartbeat and heartbeat) without error 
       */
      if (last_heartbeat > 0 && heartbeat > 0) {

         /*
          * OK we have to heartbeat entries to check. Check times ...
          * now  = current time
          * last = last check time
          */
         if ( (now - last) >= (get_active_interval + delay) ) {

            delay = 0;
            if (last_heartbeat == heartbeat) {
               DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last)));
               delay = delay_time; /* set delay time */

               /*
                * check if we are a possible new qmaster host (lock file of qmaster active, etc.)
                */
               ret = check_if_valid_shadow(binpath, oldqmaster, 
                                           ctx->get_act_qmaster_file(ctx), 
                                           ctx->get_shadow_master_file(ctx), 
                                           ctx->get_qualified_hostname(ctx), 
                                           ctx->get_binary_path(ctx));

               if (ret == 0) {
                  /* we can start a qmaster on this host */
                  if (qmaster_lock(QMASTER_LOCK_FILE)) {
                     ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER));
                  } else {
                     int out, err;

                     /* still the old qmaster name in act_qmaster file and still the old heartbeat */
                     latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30);
                     /* TODO: what do we when there is a timeout ??? */
                     DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n"));
                     if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) &&
                         !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && 
                         (latest_heartbeat == heartbeat)) {
                        char qmaster_name[256];

                        strcpy(qmaster_name, SGE_PREFIX);
                        strcat(qmaster_name, prognames[QMASTER]); 
                        DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); 

                        /*
                         * open logfile as admin user for initial qmaster/schedd 
                         * startup messages
                         */
                        out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 
                                   0644);
                        err = out;
                        if (out == -1) {
                           /*
                            * First priority is the master restart
                            * => ignore this error
                            */
                           out = 1;
                           err = 2;
                        } 

                        sge_switch2start_user();
                        ret = startprog(out, err, NULL, binpath, qmaster_name, NULL);
                        sge_switch2admin_user();
                        if (ret) {
                           ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER));
                        }
                        close(out);
                     } else {
                        qmaster_unlock(QMASTER_LOCK_FILE);
                     }
                  }      
               } else {
                  if (ret == -1) {
                     /* just log the more important failures */    
                     WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) ));
                  }
               } 
            }
            /* Begin a new interval, set timers and hearbeat to current values */
            last = now;
            last_heartbeat = heartbeat;
         }
      } else {
         if (last_heartbeat < 0 || heartbeat < 0) {
            /* There was an error reading heartbeat or last_heartbeat */
            DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n",
                     sge_u32c(last_heartbeat), sge_u32c(heartbeat)));
         } else {
            DPRINTF(("have to read the heartbeat file twice to check time differences\n"));
         }
      }
   }

   sge_shutdown((void**)&ctx, 0);

   DRETURN(EXIT_SUCCESS);
}