Example #1
0
/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file 
*
*  SYNOPSIS
*     static void sge_do_log(int aLevel, const char *aMessage, const char 
*
*  FUNCTION
*     ??? 
*
*  INPUTS
*     int aLevel           - log level
*     const char *aMessage - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage) 
{
   int fd;

   if (me == QMASTER || me == EXECD || me == SCHEDD || me == SHADOWD) {
      if ((fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666)) >= 0) {
         char msg2log[4*MAX_STRING_SIZE];
         dstring msg;
         
         sge_dstring_init(&msg, msg2log, sizeof(msg2log));

         append_time((time_t)sge_get_gmt(), &msg, false); 

         sge_dstring_sprintf_append(&msg, "|%6.6s|%s|%c|%s\n",
                 progname,
                 unqualified_hostname,
                 aLevel,
                 aMessage);

         write(fd, msg2log, strlen(msg2log));
         close(fd);
      }
   }   

   return;
} /* sge_do_log() */
Example #2
0
/****** uti/sge_log/sge_do_log() ***********************************************
*  NAME
*     sge_do_log() -- Write message to log file 
*
*  SYNOPSIS
*     static void sge_do_log(int aLevel, const char *aMessage, const char 
*
*  FUNCTION
*     ??? 
*
*  INPUTS
*     int aLevel           - log level
*     const char *aMessage - log message
*
*  RESULT
*     void - none
*
*  NOTES
*     MT-NOTE: sge_do_log() is MT safe.
*
*******************************************************************************/
static void sge_do_log(u_long32 me, const char* progname, const char* unqualified_hostname,
                       int aLevel, const char *aMessage)
{
   int fd;

   if (me == QMASTER || me == EXECD || me == SCHEDD || me == SHADOWD) {
      if ((fd = SGE_OPEN3(log_state_get_log_file(), O_WRONLY | O_APPEND | O_CREAT, 0666)) >= 0) {
         char msg2log[4*MAX_STRING_SIZE];
         dstring msg;
         int len;

         sge_dstring_init(&msg, msg2log, sizeof(msg2log));

         append_time((time_t)sge_get_gmt(), &msg, false);

         sge_dstring_sprintf_append(&msg, "|%6.6s|%s|%c|%s\n",
                 progname,
                 unqualified_hostname,
                 aLevel,
                 aMessage);

         len = strlen(msg2log);
         if (write(fd, msg2log, len) != len) {
            /* we are in error logging here - the only chance to log this problem
             * might be to write it to stderr
             */
            fprintf(stderr, "can't log to file %s: %s\n", log_state_get_log_file(), sge_strerror(errno, &msg));
         }
         close(fd);
      }
   }

   return;
} /* sge_do_log() */
static void has_finished(const char* str, double time) 
{
   DENTER(TOP_LAYER, "has_finished");

   sge_mutex_lock("has_finished", SGE_FUNC, __LINE__, &Control.mutex);
   Control.working--;
   Control.time += time;

   if (Control.working == 0) {
      Control.working = threads;
      Control.time /= threads;
      printf("%s : %fs\n", str, Control.time);
      Control.time = 0.0;
      pthread_cond_broadcast(&Control.cond_var);
      
   }
   else {
      struct timespec ts;
      ts.tv_sec = (long) (sge_get_gmt() + 180);
      ts.tv_nsec = 0; 
      pthread_cond_timedwait(&Control.cond_var,
                             &Control.mutex, &ts);
   }
   
   sge_mutex_unlock("has_finished", SGE_FUNC, __LINE__, &Control.mutex);
   
   DEXIT;
}
Example #4
0
/*
   select and suspend jobs in susp_queues 
*/
void 
suspend_job_in_queues( lList *susp_queues, lList *job_list, order_t *orders) 
{
   u_long32 now;
   int i, found;
   lListElem *jep = NULL, *ja_task = NULL;
   lListElem *qep;

   DENTER(TOP_LAYER, "suspend_job_in_queues");

   now = sge_get_gmt();
   for_each (qep, susp_queues) {
      u_long32 interval;      

      /* are suspend thresholds enabled? */
      parse_ulong_val(NULL, &interval, TYPE_TIM,
                  lGetString(qep, QU_suspend_interval), NULL, 0);

      if (interval == 0
          || !lGetUlong(qep, QU_nsuspend)
          || !lGetList(qep, QU_suspend_thresholds)) {
         continue;
      } 

      /* check time stamp */
      if (lGetUlong(qep, QU_last_suspend_threshold_ckeck) && 
         (lGetUlong(qep, QU_last_suspend_threshold_ckeck) + interval> now)) {
         continue;
      }

      for (i = 0, found = 1; 
           i < (int) lGetUlong(qep, QU_nsuspend) && found; 
           i++) {
         found = 0;
         /* find one running job in suspend queue */
         if (select4suspension(job_list, qep, &jep, &ja_task))
            break;

         /* generate suspend order for found job */
         found = 1;
         orders->configOrderList = sge_create_orders(orders->configOrderList, 
                                                     ORT_suspend_on_threshold, 
                                                     jep, ja_task, NULL, true);

         DPRINTF(("++++ suspending job "sge_u32"/"sge_u32" on threshold\n", 
                  lGetUlong(jep, JB_job_number), lGetUlong(ja_task, JAT_task_number)));

         /* prevent multiple selection of this job */
         lSetUlong(ja_task, JAT_state, 
                   lGetUlong(ja_task, JAT_state) | JSUSPENDED_ON_THRESHOLD);
      }

      if (i==0 && !found) {
         DPRINTF(("found no jobs for sot in queue %s\n", lGetString(qep, QU_full_name)));
      }
   }
/****** test_sge_lock_simple/thread_function() *********************************
*  NAME
*     thread_function() -- Thread function to execute 
*
*  SYNOPSIS
*     static void* thread_function(void *anArg) 
*
*  FUNCTION
*     Lock the global lock in read mode and sleep. Unlock the global lock. 
*
*  INPUTS
*     void *anArg - thread function arguments 
*
*  RESULT
*     static void* - none
*
*  SEE ALSO
*     test_sge_lock_simple/get_thrd_func()
*******************************************************************************/
static void *thread_function(void *anArg)
{
   DENTER(TOP_LAYER, "thread_function");
   
   SGE_LOCK(LOCK_GLOBAL, LOCK_READ);

#if 1
   DPRINTF(("Thread %u sleeping at %d\n", sge_locker_id(), sge_get_gmt()));
#endif
   sleep(5);

   SGE_UNLOCK(LOCK_GLOBAL, LOCK_READ);

   DEXIT;
   return (void *)NULL;
} /* thread_function */
static void sge_scheduler_wait_for_event(sge_evc_class_t *evc, lList **event_list)
{
   int wait_ret;
   bool do_ack = false;

   DENTER(TOP_LAYER, "sge_scheduler_wait_for_event");


   sge_mutex_lock("event_control_mutex", SGE_FUNC, __LINE__, &Scheduler_Control.mutex);

   if (!Scheduler_Control.triggered) {
      struct timespec ts;
      u_long32 current_time = sge_get_gmt();
      ts.tv_sec = (long) current_time + SCHEDULER_TIMEOUT_S;
      ts.tv_nsec = SCHEDULER_TIMEOUT_N;

      wait_ret = pthread_cond_timedwait(&Scheduler_Control.cond_var, &Scheduler_Control.mutex, &ts);

      /*
       * if pthread_cond_timedwait returns 0, we were triggered by event master
       * otherwise we ran into a timeout or an error
       */
      if (wait_ret != 0) {
         DPRINTF(("pthread_cond_timedwait for events failed %d\n", wait_ret));
      }
   }

   if (Scheduler_Control.triggered) {
      *event_list = Scheduler_Control.new_events;
      Scheduler_Control.new_events = NULL;
      Scheduler_Control.triggered = false;
      do_ack = true;
   }

   sge_mutex_unlock("event_control_mutex", SGE_FUNC, __LINE__, &Scheduler_Control.mutex);

   if (do_ack) {
      if (lGetElemUlong(*event_list, ET_type, sgeE_ACK_TIMEOUT) != NULL) {
         evc->ec_mark4registration(evc);
      }
      evc->ec_ack(evc);
   }

   DRETURN_VOID;
}
int schedd_log(const char *logstr, lList **monitor_alpp, bool monitor_next_run)
{
   DENTER(TOP_LAYER, "schedd_log");

   if (monitor_alpp != NULL) {
      /* add to answer list for verification (-w v) */
      answer_list_add(monitor_alpp, logstr, STATUS_ESEMANTIC, ANSWER_QUALITY_ERROR);
   }

   if (monitor_next_run) {
      /* do logging (-tsm) */
      time_t now;
      FILE *fp = NULL;
      char *time_str = NULL;
      char str[128];

      now = (time_t)sge_get_gmt();
      time_str =  ctime_r(&now, str);
      if (time_str[strlen(time_str) - 1] == '\n') {
         time_str[strlen(time_str) - 1] = '|';
      }

      fp = fopen(schedd_log_file, "a");
      if (!fp) {
         DPRINTF(("could not open schedd_log_file "SFQ"\n", schedd_log_file));
         DRETURN(-1);
      }

      fprintf(fp, "%s", time_str);
      fprintf(fp, "%s\n", logstr);
      FCLOSE(fp);
   }

   DRETURN(0);
FCLOSE_ERROR:
   DPRINTF((MSG_FILE_NOCLOSE_SS, schedd_log_file, strerror(errno)));
   DRETURN(-1);
}
Example #8
0
int
krb_renew_tgts(
lList *joblist 
) {
   krb5_error_code rc;
   static u_long32 next_time = 0;
   u_long32 now = sge_get_gmt();
   lListElem *job;
   krb5_context context = krb_context();
   krb5_timestamp time_now;
   krb_global_data_t *gsd = krb_gsd();

   DENTER(TOP_LAYER, "krb_renew_tgts");


   if ((now = sge_get_gmt())<next_time) {
      DEXIT;
      return 0;
   }

   if ((rc = krb5_timeofday(context, &time_now))) {
      ERROR((SGE_EVENT, MSG_KRB_KRB5TIMEOFDAYFAILEDX_S ,
	     error_message(rc)));
      DEXIT;
      return -1;
   }

   /* renew job TGT's */

   for_each(job, joblist) {

      krb5_error_code rc;
      krb5_creds ** tgt_creds = NULL;
      krb5_data tgtbuf;
      const char *tgtstr = NULL;

      tgtbuf.length = 0;

      /* get TGT out of job entry */

      if ((tgtstr = lGetString(job, JB_tgt))) {

         tgtbuf.data = krb_str2bin(tgtstr, NULL, &tgtbuf.length);

         if (tgtbuf.length) {

            /* decrypt the TGT using the daemon key */

            if ((rc = krb_decrypt_tgt_creds(&tgtbuf, &tgt_creds))) {

               ERROR((SGE_EVENT, MSG_KRB_COULDNOTDECRYPTTGTFORJOBXY_DS,
                      sge_u32c(lGetUlong(job, JB_job_number)),
                      error_message(rc)));

            }

	    if (rc == 0 && tgt_creds) {

               krb5_creds *tgt = *tgt_creds;

	       /*
		* If TGT is renewable and TGT expiration time is not past
		* and is within the SGE renewal threshold and the TGT
		* renewal period is not past, then renew the TGT
		*/

	       if (tgt->ticket_flags & KDC_OPT_RENEWABLE &&
		   tgt->times.endtime > time_now &&
		   tgt->times.renew_till > time_now &&
		   tgt->times.endtime < time_now + gsd->tgt_renew_threshold) {

		  krb5_creds *new_creds[2];
		  krb5_creds creds;

		  memset(new_creds, 0, sizeof(new_creds));
		  memset(&creds, 0 ,sizeof(creds));

		  /* renew the TGT */

		  if (((rc = krb5_copy_principal(context, (*tgt_creds)->server,
			&creds.server))) ||
		      ((rc = krb5_copy_principal(context, (*tgt_creds)->client,
			&creds.client))) ||
		      ((rc = krb5_get_cred_via_tkt(context, tgt,
		        FLAGS2OPTS(tgt->ticket_flags)|KDC_OPT_RENEW,
		        tgt->addresses, &creds, &new_creds[0])))) {

		     ERROR((SGE_EVENT, MSG_KRB_COULDNOTRENEWTGTFORJOBXY_DS, 
                  sge_u32c(lGetUlong(job, JB_job_number)),
			         error_message(rc)));

		  }

		  krb5_free_cred_contents(context, &creds);

		  if (rc == 0) {
                     krb5_data outbuf;

		     /* store the new TGT back into the job entry */

		     outbuf.length = 0;

		     if ((rc = krb_encrypt_tgt_creds(new_creds, &outbuf))) {

			ERROR((SGE_EVENT, MSG_KRB_COULDNOTECRYPTTGTFORJOBXY_DS,
                sge_u32c(lGetUlong(job, JB_job_number)),
			       error_message(rc)));

		     } else {

			lSetString(job, JB_tgt,
			      krb_bin2str(outbuf.data, outbuf.length, NULL));
                     }

		     /* if we are called by the execd, also store the
			new TGT in the credentials cache of the user */

                     if (!strcmp(prognames[EXECD], gsd->progname)) {
                         
                        int retries = MAX_NIS_RETRIES;
			struct passwd *pw = NULL;

			while (retries-- && !pw)
			   pw = getpwnam(lGetString(job, JB_owner));

			if (pw) {

			   if ((krb_store_forwarded_tgt(pw->pw_uid,
				    lGetUlong(job, JB_job_number),
				    new_creds))) {

			       ERROR((SGE_EVENT, MSG_KRB_COULDNOTSTORERENEWEDTGTFORXJOBY_SD,
                                      lGetString(job, JB_owner),
				      sge_u32c(lGetUlong(job, JB_job_number))));
                           }

                        } else {

			   ERROR((SGE_EVENT, MSG_KRB_COULDNOTGETUSERIDFORXY_SD , lGetString(job, JB_owner),
				  sge_u32c(lGetUlong(job, JB_job_number))));
                        }
		     }

		     if (outbuf.length)
			krb5_xfree(outbuf.data);

		  }

      if (!mconf_get_simulate_jobs()) {
         job_write_spool_file(job, 0, NULL, SPOOL_DEFAULT);;
      }

		  if (new_creds[0])
		     krb5_free_creds(context, new_creds[0]);

	       }
	    }
         }

         if (tgtbuf.length)
            krb5_xfree(tgtbuf.data);

         if (tgt_creds)
            krb5_free_tgt_creds(context, tgt_creds);

      }

   }
Example #9
0
/****** execd/sge_execd_register_at_qmaster() **********************************
*  NAME
*     sge_execd_register_at_qmaster() -- modify execd list at qmaster site
*
*  SYNOPSIS
*     int sge_execd_register_at_qmaster(void) 
*
*  FUNCTION
*     add local execd name to SGE_EH_LIST in order to register at
*     qmaster
*
*  INPUTS
*     void - no input
*
*  RESULT
*     int - 0 = success / 1 = error
*
*  NOTES
*     MT-NOTE: sge_execd_register_at_qmaster() is not MT safe 
*
*******************************************************************************/
int sge_execd_register_at_qmaster(sge_gdi_ctx_class_t *ctx, bool is_restart) {
   int return_value = 0;
   static int sge_last_register_error_flag = 0;
   lList *alp = NULL;

   /* 
    * If it is a reconnect (is_restart == true) the act_qmaster file must be
    * re-read in order to update ctx qmaster cache when master migrates. 
    */
   const char *master_host = ctx->get_master(ctx, is_restart);

   DENTER(TOP_LAYER, "sge_execd_register_at_qmaster");

   /* We will not try to make a gdi request when qmaster is not alive. The
    * gdi will return with timeout after one minute. If qmaster is not alive
    * we will not try a gdi request!
    */
   if (master_host != NULL && ctx->is_alive(ctx) == CL_RETVAL_OK) {
      lList *hlp = lCreateList("exechost starting", EH_Type);
      lListElem *hep = lCreateElem(EH_Type);
      lSetUlong(hep, EH_featureset_id, feature_get_active_featureset_id());
      lAppendElem(hlp, hep);

      /* register at qmaster */
      DPRINTF(("*****  Register at qmaster   *****\n"));
      if (!is_restart) {
         /*
          * This is a regular startup.
          */
         alp = ctx->gdi(ctx, SGE_EH_LIST, SGE_GDI_ADD, &hlp, NULL, NULL);
      } else {
         /*
          * Indicate this is a restart to qmaster.
          * This is used for the initial_state of queue_configuration implementation.
          */
         alp = ctx->gdi(ctx, SGE_EH_LIST, SGE_GDI_ADD | SGE_GDI_EXECD_RESTART,
                        &hlp, NULL, NULL);
      }
      lFreeList(&hlp);
   } else {
      DPRINTF(("*****  Register at qmaster - qmaster not alive!  *****\n"));
   }

   if (alp == NULL) {
      if (sge_last_register_error_flag == 0) {
         WARNING((SGE_EVENT, MSG_COM_CANTREGISTER_SS, master_host?master_host:"", MSG_COM_ERROR));
         sge_last_register_error_flag = 1;
      }
      return_value = 1;
   } else {
      lListElem *aep = lFirst(alp);
      if (lGetUlong(aep, AN_status) != STATUS_OK) {
         if (sge_last_register_error_flag == 0) {
            WARNING((SGE_EVENT, MSG_COM_CANTREGISTER_SS, master_host?master_host:"", lGetString(aep, AN_text)));
            sge_last_register_error_flag = 1;
         }
         return_value = 1;
      }
   }
 
   if (return_value == 0) {
      sge_last_register_error_flag = 0;
      INFO((SGE_EVENT, MSG_EXECD_REGISTERED_AT_QMASTER_S, master_host?master_host:""));
      last_qmaster_registration_time = sge_get_gmt();
   }
   lFreeList(&alp);
   DRETURN(return_value);
}
Example #10
0
/****** qmaster/sge_qmaster_main/main() ****************************************
*  NAME
*     main() -- qmaster entry point 
*
*  SYNOPSIS
*     int main(int argc, char* argv[]) 
*
*  FUNCTION
*     Qmaster entry point.
*
*     NOTE: The main thread must block all signals before any additional thread
*     is created. Failure to do so will ruin signal handling!
*
*  INPUTS
*     int argc     - number of commandline arguments 
*     char* argv[] - commandline arguments 
*
*  RESULT
*     0 - success 
*
*  NOTES
*     We check whether 'SGE_ROOT' is set before we daemonize. Once qmaster is
*     a daemon, we are no longer connected to a terminal and hence can not
*     output an error message to stdout or stderr.
*
*     We need to inovke 'prepare_enroll()' *before* the user id is switched via
*     'become_admin_user()'. This is because qmaster must be able to bind a so
*     called reserved port (requires root privileges) if configured to do so.
*
*******************************************************************************/
int main(int argc, char* argv[])
{
   int max_enroll_tries;
   int ret_val;
   int file_descriptor_settings_result = 0;
   bool has_daemonized = false;
   sge_gdi_ctx_class_t *ctx = NULL;
   u_long32 start_time = sge_get_gmt();
   monitoring_t monitor;

   DENTER_MAIN(TOP_LAYER, "qmaster");

   sge_monitor_init(&monitor, "MAIN", NONE_EXT, MT_WARNING, MT_ERROR);
   prof_mt_init();

   sge_get_root_dir(true, NULL, 0, true);
   
#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   sge_init_language_func((gettext_func_type)gettext, (setlocale_func_type)setlocale, (bindtextdomain_func_type)bindtextdomain, (textdomain_func_type)textdomain);
   sge_init_language(NULL,NULL);   
#endif 

   /* 
    * qmaster doesn't support any commandline anymore,
    * but we should show version string and -help option 
    */
   if (argc != 1) {
      sigset_t sig_set;
      sigfillset(&sig_set);
      pthread_sigmask(SIG_SETMASK, &sig_set, NULL);
      sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, true);
      sge_process_qmaster_cmdline(argv);
      SGE_EXIT((void**)&ctx, 1);
   }

   /*
    * daemonize qmaster
    * set file descriptor limits
    * and initialize libraries to be used in multi threaded environment
    * also take care that finished child processed of this process become
    * zombie jobs
    */
   has_daemonized = sge_daemonize_qmaster();
   file_descriptor_settings_result = set_file_descriptor_limit();
#if !defined(INTERIX) && !defined(CYGWIN)
   init_sig_action_and_mask();
#endif

   /* init qmaster threads without becomming admin user */
   sge_qmaster_thread_init(&ctx, QMASTER, MAIN_THREAD, false);

   ctx->set_daemonized(ctx, has_daemonized);

   /* this must be done as root user to be able to bind ports < 1024 */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;
      if (max_enroll_tries <= 0) {
         /* exit after 30 seconds */
         CRITICAL((SGE_EVENT, MSG_QMASTER_COMMUNICATION_ERRORS ));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[QMASTER],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
      }
   }

   /*
    * now the commlib up and running. Set qmaster application status function 
    * (commlib callback function for qping status information response 
    *  messages (SIRM))
    */
   ret_val = cl_com_set_status_func(sge_qmaster_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
   }

   /* 
    * now we become admin user change into the correct root directory set the
    * the target for logging messages
    */
   sge_become_admin_user(ctx->get_admin_user(ctx));
   sge_chdir_exit(ctx->get_qmaster_spool_dir(ctx), 1);
   log_state_set_log_file(ERR_FILE);
   ctx->set_exit_func(ctx, sge_exit_func);

#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /*
    * We do increment the heartbeat manually here. This is the 'startup heartbeat'. 
    * The first time the hearbeat will be incremented through the heartbeat event 
    * handler is after about HEARTBEAT_INTERVAL seconds. The hardbeat event handler
    * is setup during the initialisazion of the timer thread.
    */
   inc_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, HEARTBEAT_INTERVAL, NULL);
     
   /*
    * Event master module has to be initialized already here because
    * sge_setup_qmaster() might already access it although event delivery
    * thread is not running.
    *
    * Corresponding shutdown is done in sge_event_master_terminate();
    *
    * EB: In my opinion the init function should called in
    * sge_event_master_initialize(). Is it possible to move that call?
    */ 
   sge_event_master_init();

   sge_setup_qmaster(ctx, argv);

#ifndef USE_POLL
   if (file_descriptor_settings_result == 1) {
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_LARGER_THAN_LIMIT_U, sge_u32c(FD_SETSIZE)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE1_U, sge_u32c(FD_SETSIZE - 20)));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE2));
      WARNING((SGE_EVENT, MSG_QMASTER_FD_SETSIZE_COMPILE_MESSAGE3));
   }
#endif

   /*
    * Setup all threads and initialize corresponding modules. 
    * Order is important!
    */
   sge_signaler_initialize(ctx);
   sge_event_master_initialize(ctx);
   sge_timer_initialize(ctx, &monitor);
   sge_worker_initialize(ctx);
#if 0
   sge_test_initialize(ctx);
#endif
   sge_listener_initialize(ctx);
   sge_scheduler_initialize(ctx, NULL);
#ifndef NO_JNI
   sge_jvm_initialize(ctx, NULL);
#endif

   INFO((SGE_EVENT, "qmaster startup took "sge_u32" seconds", sge_get_gmt() - start_time));

   /*
    * Block till signal from signal thread arrives us
    */
   sge_thread_wait_for_signal();

   /* 
    * Shutdown all threads and shutdown corresponding modules.
    * Order is important!
    */
#ifndef NO_JNI
   sge_jvm_terminate(ctx, NULL);
#endif
   sge_scheduler_terminate(ctx, NULL);
   sge_listener_terminate();
#if 0
   sge_test_terminate(ctx);
#endif
   sge_worker_terminate(ctx);
   sge_timer_terminate();
   sge_event_master_terminate();
   sge_signaler_terminate();

   /*
    * Remaining shutdown operations
    */
   sge_clean_lists();
   sge_monitor_free(&monitor);

   sge_shutdown((void**)&ctx, sge_qmaster_get_exit_state());
   sge_prof_cleanup();

   DEXIT;
   return 0;
} /* main() */
/****** sge_advance_reservation/ar_validate() **********************************
*  NAME
*     ar_validate() -- validate a advance reservation
*
*  SYNOPSIS
*     bool ar_validate(lListElem *ar, lList **alpp, bool in_master)
*
*  FUNCTION
*     Ensures a new ar has valid start and end times
*
*  INPUTS
*     lListElem *ar   - the ar to check
*     lList **alpp    - answer list pointer
*     bool in_master  - are we in qmaster?
*     bool is_spool   - do we validate for spooling? 
*
*  RESULT
*     bool - true if OK, else false
*
*  NOTES
*     MT-NOTE: ar_validate() is MT safe
*******************************************************************************/
bool ar_validate(lListElem *ar, lList **alpp, bool in_master, bool is_spool)
{
   u_long32 start_time;
   u_long32 end_time;
   u_long32 duration;
   u_long32 now = sge_get_gmt();
   object_description *object_base = object_type_get_object_description();
   
   DENTER(TOP_LAYER, "ar_validate");

   /*   AR_start_time, SGE_ULONG        */
   if ((start_time = lGetUlong(ar, AR_start_time)) == 0) {
      start_time = now;
      lSetUlong(ar, AR_start_time, start_time);
   }

   /*   AR_end_time, SGE_ULONG        */
   end_time = lGetUlong(ar, AR_end_time);
   duration = lGetUlong(ar, AR_duration);
   
   if (end_time == 0 && duration == 0) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_MISSING_VALUE_S, "end time or duration");
      goto ERROR;
   } else if (end_time == 0) {
      end_time = duration_add_offset(start_time, duration);
      duration = end_time  - start_time;
      lSetUlong(ar, AR_end_time, end_time);
      lSetUlong(ar, AR_duration, duration);
   } else if (duration == 0) {
      duration = end_time - start_time;
      lSetUlong(ar, AR_duration, duration);
   }

   if ((end_time - start_time) != duration) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_START_END_DURATION_INVALID);
      goto ERROR;
   }

   if (start_time > end_time) {
      answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                              MSG_AR_START_LATER_THAN_END);
      goto ERROR;
   }
   
   if (!is_spool) {
      if (start_time < now) {
         answer_list_add_sprintf(alpp, STATUS_EEXIST, ANSWER_QUALITY_ERROR,
                                 MSG_AR_START_IN_PAST);
         goto ERROR;
      }
   }
   /*   AR_owner, SGE_STRING */
   
   if (in_master) {
      /*    AR_name, SGE_STRING */
      NULL_OUT_NONE(ar, AR_name);
      if (object_verify_name(ar, alpp, AR_name, SGE_OBJ_AR)) {
         goto ERROR;
      }
      /*   AR_account, SGE_STRING */
      NULL_OUT_NONE(ar, AR_account);
      if (!lGetString(ar, AR_account)) {
         lSetString(ar, AR_account, DEFAULT_ACCOUNT);
      } else {
         if (verify_str_key(alpp, lGetString(ar, AR_account), MAX_VERIFY_STRING,
         "account string", QSUB_TABLE) != STATUS_OK) {
            goto ERROR;
         }
      }
      /*   AR_verify, SGE_ULONG              just verify the reservation or final case */
      /*   AR_error_handling, SGE_ULONG      how to deal with soft and hard exceptions */
      /*   AR_checkpoint_name, SGE_STRING    Named checkpoint */
      NULL_OUT_NONE(ar, AR_checkpoint_name);
      {
         /* request for non existing ckpt object will be refused */
         const char *ckpt_name = NULL;

         ckpt_name = lGetString(ar, AR_checkpoint_name);
         if (ckpt_name != NULL) {
            lList *master_ckpt_list = *object_base[SGE_TYPE_CKPT].list;
            lListElem *ckpt_ep = ckpt_list_locate(master_ckpt_list, ckpt_name);
            if (!ckpt_ep) {
               ERROR((SGE_EVENT, MSG_JOB_CKPTUNKNOWN_S, ckpt_name));
               answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
               goto ERROR;
            }
          }
      }
      /*   AR_resource_list, SGE_LIST */
      {
         lList *master_centry_list = *object_base[SGE_TYPE_CENTRY].list;

         if (centry_list_fill_request(lGetList(ar, AR_resource_list),
                                      alpp, master_centry_list, false, true,
                                      false)) {
            goto ERROR;
         }
         if (compress_ressources(alpp, lGetList(ar, AR_resource_list), SGE_OBJ_AR)) {
            goto ERROR;
         }
         
         if (!centry_list_is_correct(lGetList(ar, AR_resource_list), alpp)) {
            goto ERROR;
         }
      }
      /*   AR_queue_list, SGE_LIST */
      if (!qref_list_is_valid(lGetList(ar, AR_queue_list), alpp)) {
         goto ERROR;
      }
      /*   AR_mail_options, SGE_ULONG   */
      /*   AR_mail_list, SGE_LIST */
      
      /*   AR_master_queue_list  -masterq wc_queue_list, SGE_LIST bind master task to queue(s) */
      if (!qref_list_is_valid(lGetList(ar, AR_master_queue_list), alpp)) {
         goto ERROR;
      }
       
      
      /*   AR_pe, SGE_STRING,  AR_pe_range, SGE_LIST */
      NULL_OUT_NONE(ar, AR_pe);
      {
         const char *pe_name = NULL;
         lList *pe_range = NULL;
         
         pe_name = lGetString(ar, AR_pe);
         if (pe_name) {
            const lListElem *pep;
            pep = pe_list_find_matching(*object_base[SGE_TYPE_PE].list, pe_name);
            if (!pep) {
               ERROR((SGE_EVENT, MSG_JOB_PEUNKNOWN_S, pe_name));
               answer_list_add(alpp, SGE_EVENT, STATUS_EUNKNOWN, ANSWER_QUALITY_ERROR);
               goto ERROR;
            }
            /* check pe_range */
            pe_range = lGetList(ar, AR_pe_range);
            if (object_verify_pe_range(alpp, pe_name, pe_range, SGE_OBJ_AR)!=STATUS_OK) {
               goto ERROR;
            }
         }
      }

      /*   AR_acl_list, SGE_LIST */
      if (userset_list_validate_access(lGetList(ar, AR_acl_list), ARA_name, alpp) != STATUS_OK) {
         goto ERROR;
      }
      
      /*   AR_xacl_list, SGE_LIST */
      if (userset_list_validate_access(lGetList(ar, AR_xacl_list), ARA_name, alpp) != STATUS_OK) {
         goto ERROR;
      }

      if (is_spool) {
         lListElem *jg;
         dstring cqueue_buffer = DSTRING_INIT;
         dstring hostname_buffer = DSTRING_INIT;
         for_each(jg, lGetList(ar, AR_granted_slots)){
            const char *hostname = NULL;
            const char *qname = lGetString(jg, JG_qname);
            bool has_hostname = false;
            bool has_domain = false;

            cqueue_name_split(qname, &cqueue_buffer, &hostname_buffer,
                              &has_hostname, &has_domain);
            hostname = sge_dstring_get_string(&hostname_buffer);
            lSetHost(jg, JG_qhostname, hostname);
         }
         sge_dstring_free(&cqueue_buffer);
         sge_dstring_free(&hostname_buffer);
      }
      /*   AR_type,  SGE_ULONG     */
      /*   AR_state, SGE_ULONG               state of the AR */
      if(lGetUlong(ar, AR_state) == ARL_UNKNOWN){
         lSetUlong(ar, AR_state, ARL_CREATION);  
      }
   }
   DRETURN(true);

ERROR:
   DRETURN(false);
}
Example #12
0
/*----------------------------------------------------------------------------*/
int 
main(int argc, char **argv)
{
   int heartbeat        = 0;
   int last_heartbeat   = 0;
   int latest_heartbeat = 0;
   int ret              = 0;
   int delay            = 0;
   time_t now, last;
/*    const char *cp; */
   char err_str[MAX_STRING_SIZE];
   char shadowd_pidfile[SGE_PATH_MAX];
   dstring ds;
   char buffer[256];
   pid_t shadowd_pid;

#if 1

static int check_interval = CHECK_INTERVAL;
static int get_active_interval = GET_ACTIVE_INTERVAL;
static int delay_time = DELAY_TIME;
static int sge_test_heartbeat = 0;

char binpath[SGE_PATH_MAX];
char oldqmaster[SGE_PATH_MAX];

char shadow_err_file[SGE_PATH_MAX];
char qmaster_out_file[SGE_PATH_MAX];

#endif

   lList *alp = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;

   DENTER_MAIN(TOP_LAYER, "sge_shadowd");
   
   sge_dstring_init(&ds, buffer, sizeof(buffer));
   /* initialize recovery control variables */
   {
      char *s;
      int val;
      if ((s=getenv("SGE_CHECK_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         check_interval = val;
      if ((s=getenv("SGE_GET_ACTIVE_INTERVAL")) &&
          sscanf(s, "%d", &val) == 1)
         get_active_interval = val;
      if ((s=getenv("SGE_DELAY_TIME")) &&
          sscanf(s, "%d", &val) == 1)
         delay_time = val;
      if ((s=getenv("SGE_TEST_HEARTBEAT_TIMEOUT")) &&
          sscanf(s, "%d", &val) == 1)
         sge_test_heartbeat = val;
   }
         
   /* This needs a better solution */
   umask(022);

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   log_state_set_log_file(TMP_ERR_FILE_SHADOWD);

   if (sge_setup2(&ctx, SHADOWD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }

   /* AA: TODO: change this */
   ctx->set_exit_func(ctx, shadowd_exit_func);
   sge_setup_sig_handlers(SHADOWD);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   if (ctx->get_qmaster_spool_dir(ctx) != NULL) {
      char *shadowd_name = SGE_SHADOWD;

      /* is there a running shadowd on this host (with unqualified name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_unqualified_hostname(ctx));

      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }

      ctx->prepare_enroll(ctx);

      /* is there a running shadowd on this host (with aliased name) */
      sprintf(shadowd_pidfile, "%s/"SHADOWD_PID_FILE, ctx->get_qmaster_spool_dir(ctx), 
              ctx->get_qualified_hostname(ctx));
      DPRINTF(("pidfilename: %s\n", shadowd_pidfile));
      if ((shadowd_pid = sge_readpid(shadowd_pidfile))) {
         DPRINTF(("shadowd_pid: "sge_U32CFormat"\n", sge_u32c(shadowd_pid)));
         if (!sge_checkprog(shadowd_pid, shadowd_name, PSCMD)) {
            CRITICAL((SGE_EVENT, MSG_SHADOWD_FOUNDRUNNINGSHADOWDWITHPIDXNOTSTARTING_I, (int) shadowd_pid));
            SGE_EXIT((void**)&ctx, 1);
         }
      }  
   } else {
      ctx->prepare_enroll(ctx);
   }

   if (parse_cmdline_shadowd(argc, argv) == 1) {
      SGE_EXIT((void**)&ctx, 0);
   }
   
   if (ctx->get_qmaster_spool_dir(ctx) == NULL) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTREADQMASTERSPOOLDIRFROMX_S, ctx->get_bootstrap_file(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (chdir(ctx->get_qmaster_spool_dir(ctx))) {
      CRITICAL((SGE_EVENT, MSG_SHADOWD_CANTCHANGETOQMASTERSPOOLDIRX_S, ctx->get_qmaster_spool_dir(ctx)));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_set_admin_username(ctx->get_admin_user(ctx), err_str)) {
      CRITICAL((SGE_EVENT, SFNMAX, err_str));
      SGE_EXIT((void**)&ctx, 1);
   }

   if (sge_switch2admin_user()) {
      CRITICAL((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSWITCHTOADMIN_USER));
      SGE_EXIT((void**)&ctx, 1);
   }

   sprintf(shadow_err_file, "messages_shadowd.%s", ctx->get_unqualified_hostname(ctx));
   sprintf(qmaster_out_file, "messages_qmaster.%s", ctx->get_unqualified_hostname(ctx));
   sge_copy_append(TMP_ERR_FILE_SHADOWD, shadow_err_file, SGE_MODE_APPEND);
   unlink(TMP_ERR_FILE_SHADOWD);
   log_state_set_log_as_admin_user(1);
   log_state_set_log_file(shadow_err_file);

   {
      int* tmp_fd_array = NULL;
      unsigned long tmp_fd_count = 0;

      if (cl_com_set_handle_fds(cl_com_get_handle(prognames[SHADOWD] ,0), &tmp_fd_array, &tmp_fd_count) == CL_RETVAL_OK) {
         sge_daemonize(tmp_fd_array, tmp_fd_count, ctx);
         if (tmp_fd_array != NULL) {
            sge_free(&tmp_fd_array);
         }
      } else {
         sge_daemonize(NULL, 0, ctx);
      }
   }

   /* shadowd pid file will contain aliased name */
   sge_write_pid(shadowd_pidfile);

   starting_up();
   
   sge_setup_sig_handlers(SHADOWD);

   last_heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

   last = (time_t) sge_get_gmt(); /* set time of last check time */

   delay = 0;
   while (!shut_me_down) {
      sleep(check_interval);

      /* get current heartbeat file content */
      heartbeat = get_qmaster_heartbeat(QMASTER_HEARTBEAT_FILE, 30);

      now = (time_t) sge_get_gmt();


      /* Only check when we could read the heartbeat file at least two times
       * (last_heartbeat and heartbeat) without error 
       */
      if (last_heartbeat > 0 && heartbeat > 0) {

         /*
          * OK we have to heartbeat entries to check. Check times ...
          * now  = current time
          * last = last check time
          */
         if ( (now - last) >= (get_active_interval + delay) ) {

            delay = 0;
            if (last_heartbeat == heartbeat) {
               DPRINTF(("heartbeat not changed since seconds: "sge_U32CFormat"\n", sge_u32c(now - last)));
               delay = delay_time; /* set delay time */

               /*
                * check if we are a possible new qmaster host (lock file of qmaster active, etc.)
                */
               ret = check_if_valid_shadow(binpath, oldqmaster, 
                                           ctx->get_act_qmaster_file(ctx), 
                                           ctx->get_shadow_master_file(ctx), 
                                           ctx->get_qualified_hostname(ctx), 
                                           ctx->get_binary_path(ctx));

               if (ret == 0) {
                  /* we can start a qmaster on this host */
                  if (qmaster_lock(QMASTER_LOCK_FILE)) {
                     ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_FAILEDTOLOCKQMASTERSOMBODYWASFASTER));
                  } else {
                     int out, err;

                     /* still the old qmaster name in act_qmaster file and still the old heartbeat */
                     latest_heartbeat = get_qmaster_heartbeat( QMASTER_HEARTBEAT_FILE, 30);
                     /* TODO: what do we when there is a timeout ??? */
                     DPRINTF(("old qmaster name in act_qmaster and old heartbeat\n"));
                     if (!compare_qmaster_names(ctx->get_act_qmaster_file(ctx), oldqmaster) &&
                         !shadowd_is_old_master_enrolled(sge_test_heartbeat, sge_get_qmaster_port(NULL), oldqmaster) && 
                         (latest_heartbeat == heartbeat)) {
                        char qmaster_name[256];

                        strcpy(qmaster_name, SGE_PREFIX);
                        strcat(qmaster_name, prognames[QMASTER]); 
                        DPRINTF(("qmaster_name: "SFN"\n", qmaster_name)); 

                        /*
                         * open logfile as admin user for initial qmaster/schedd 
                         * startup messages
                         */
                        out = SGE_OPEN3(qmaster_out_file, O_CREAT|O_WRONLY|O_APPEND, 
                                   0644);
                        err = out;
                        if (out == -1) {
                           /*
                            * First priority is the master restart
                            * => ignore this error
                            */
                           out = 1;
                           err = 2;
                        } 

                        sge_switch2start_user();
                        ret = startprog(out, err, NULL, binpath, qmaster_name, NULL);
                        sge_switch2admin_user();
                        if (ret) {
                           ERROR((SGE_EVENT, SFNMAX, MSG_SHADOWD_CANTSTARTQMASTER));
                        }
                        close(out);
                     } else {
                        qmaster_unlock(QMASTER_LOCK_FILE);
                     }
                  }      
               } else {
                  if (ret == -1) {
                     /* just log the more important failures */    
                     WARNING((SGE_EVENT, MSG_SHADOWD_DELAYINGSHADOWFUNCFORXSECONDS_U, sge_u32c(delay) ));
                  }
               } 
            }
            /* Begin a new interval, set timers and hearbeat to current values */
            last = now;
            last_heartbeat = heartbeat;
         }
      } else {
         if (last_heartbeat < 0 || heartbeat < 0) {
            /* There was an error reading heartbeat or last_heartbeat */
            DPRINTF(("can't read heartbeat file. last_heartbeat="sge_U32CFormat", heartbeat="sge_U32CFormat"\n",
                     sge_u32c(last_heartbeat), sge_u32c(heartbeat)));
         } else {
            DPRINTF(("have to read the heartbeat file twice to check time differences\n"));
         }
      }
   }

   sge_shutdown((void**)&ctx, 0);

   DRETURN(EXIT_SUCCESS);
}
/****** uti/host/sge_gethostbyaddr() ****************************************
*  NAME
*     sge_gethostbyaddr() -- gethostbyaddr() wrapper
*
*  SYNOPSIS
*     struct hostent *sge_gethostbyaddr(const struct in_addr *addr, int* system_error_retval)
*
*  FUNCTION
*     Wraps gethostbyaddr() function calls, measures time spent
*     in gethostbyaddr() and logs when very much time has passed.
*     On error, return error code in *system_error_retval if that is non-null.
*
*     return value must be released by function caller (don't forget the 
*     char** array lists inside of struct hostent)
*
*     If possible (libcomm linked) use  cl_com_cached_gethostbyaddr() 
*     from libcomm. This will return an sge aliased hostname.
*
*
*  NOTES
*     MT-NOTE: sge_gethostbyaddr() is MT safe
*     MT-NOTE: sge_gethostbyaddr() uses a mutex to guard access to the
*     MT-NOTE: gethostbyaddr() system call on all platforms other than Solaris,
*     MT-NOTE: Linux, and HP-UX.  Therefore, except on the aforementioned
*     MT-NOTE: platforms, MT calls to gethostbyaddr() must go through
*     MT-NOTE: sge_gethostbyaddr() to be MT safe.
*******************************************************************************/
struct hostent *sge_gethostbyaddr(const struct in_addr *addr, int* system_error_retval)
{
   struct hostent *he = NULL;
   time_t now;
   time_t time;
   int l_errno;

   DENTER(TOP_LAYER, "sge_gethostbyaddr");

   /* This method goes to great lengths to slip a reentrant gethostbyaddr into
    * the code without making changes to the rest of the source base.  That
    * basically means that we have to make some redundant copies to
    * return to the caller.  This method doesn't appear to be highly utilized,
    * so that's probably ok.  If it's not ok, the interface can be changed
    * later. */

   gethostbyaddr_calls++;      /* profiling */
   now = (time_t)sge_get_gmt();

#ifdef GETHOSTBYADDR_R8
#define SGE_GETHOSTBYADDR_FOUND
   /* This is for Linux */
   DPRINTF (("Getting host by addr - Linux\n"));
   {
      struct hostent re;
      char buffer[4096];

      /* No need to malloc he because it will end up pointing to re. */
      gethostbyaddr_r ((const char *)addr, 4, AF_INET, &re, buffer, 4096, &he, &l_errno);
      
      /* Since re contains pointers into buffer, and both re and the buffer go
       * away when we exit this code block, we make a deep copy to return. */
      /* Yes, I do mean to check if he is NULL and then copy re!  No, he
       * doesn't need to be freed first. */
      if (he != NULL) {
         he = sge_copy_hostent (&re);
      }
   }
#endif
#ifdef GETHOSTBYADDR_R7
#define SGE_GETHOSTBYADDR_FOUND
   /* This is for Solaris */
   DPRINTF(("Getting host by addr - Solaris\n"));
   {
      char buffer[4096];
      struct hostent *help_he = NULL;
      he = (struct hostent *)malloc(sizeof(struct hostent));
      if (he != NULL) {
         memset(he, 0, sizeof(struct hostent));

         /* On Solaris, this function returns the pointer to my struct on success
          * and NULL on failure. */
         help_he = gethostbyaddr_r((const char *)addr, 4, AF_INET, he, buffer, 4096, &l_errno);
      
         /* Since he contains pointers into buffer, and buffer goes away when we
          * exit this code block, we make a deep copy to return. */
         if (help_he != NULL) {
            struct hostent *new_he = sge_copy_hostent(help_he);
            sge_free(&he);
            he = new_he;
         } else {
            sge_free(&he);
         }
      }
   }
#endif

#ifdef GETHOSTBYADDR_R5
#define SGE_GETHOSTBYADDR_FOUND
   /* This is for HPUX < 11 */
   DPRINTF(("Getting host by addr - 3 arg\n"));
   
   {
      struct hostent_data he_data;
     
      memset(&he_data, 0, sizeof(he_data));
      he = (struct hostent *)malloc (sizeof (struct hostent));
      if (he != NULL) {
         memset(he, 0, sizeof(struct hostent));
         if (gethostbyaddr_r ((const char *)addr, 4, AF_INET, he, &he_data) < 0) {
            /* If this function fails, free he so that we can test if it's NULL
             * later in the code. */
            sge_free(&he);
         }
         /* The location of the error code is actually undefined.  I'm just
          * assuming that it's in h_errno since that's where it is in the unsafe
          * version.
          * h_errno is, of course, not thread safe, but if there's an error we're
          * already screwed, so we won't worry to much about it.
          * An alternative would be to set errno to HOST_NOT_FOUND. */
         l_errno = h_errno;
         
         /* Since he contains pointers into he_data, and he_data goes away when we
          * exit this code block, we make a deep copy to return. */
         if (he != NULL) {
            struct hostent *new_he = sge_copy_hostent (he);
            sge_free(&he);
            he = new_he;
         }
      }
   }
#endif
#ifdef GETHOSTBYADDR
#define SGE_GETHOSTBYADDR_FOUND
   /* This is for HPUX >= 11 */
   DPRINTF(("Getting host by addr - Thread safe\n"));
   he = gethostbyaddr((const char *)addr, 4, AF_INET);
   /*
    * JG: TODO: shouldn't it be 
    * he = gethostbyaddr((const char *)addr, sizeof(struct in_addr), AF_INET);
    */

   /* The location of the error code is actually undefined.  I'm just
    * assuming that it's in h_errno since that's where it is in the unsafe
    * version.
    * h_errno is, of course, not thread safe, but if there's an error we're
    * already screwed, so we won't worry too much about it.
    * An alternative would be to set errno to HOST_NOT_FOUND. */
   l_errno = h_errno;
   if (he != NULL) {
      struct hostent *new_he = sge_copy_hostent(he);
      /* do not free he, there was no malloc() */
      he = new_he;
   }
#endif


#ifdef GETHOSTBYADDR_M
#define SGE_GETHOSTBYADDR_FOUND
   /* This is for everyone else. */
   DPRINTF (("Getting host by addr - Mutex guarded\n"));
   
   sge_mutex_lock("hostbyaddr", SGE_FUNC, __LINE__, &hostbyaddr_mutex);

   /* JG: TODO: shouldn't it always be sizeof(struct in_addr)? */
   he = gethostbyaddr((const char *)addr, 4, AF_INET);

   l_errno = h_errno;
   if (he != NULL) {
      struct hostent *new_he = sge_copy_hostent(he);
      /* do not free he, there was no malloc() */
      he = new_he;
   }
   sge_mutex_unlock("hostbyaddr", SGE_FUNC, __LINE__, &hostbyaddr_mutex);
#endif

#ifndef SGE_GETHOSTBYADDR_FOUND
#error "no sge_gethostbyaddr() definition for this architecture."
#endif
   time = (time_t)sge_get_gmt() - now;
   gethostbyaddr_sec += time;   /* profiling */

   /* warn about blocking gethostbyaddr() calls */
   if (time > MAX_RESOLVER_BLOCKING) {
      WARNING((SGE_EVENT, "gethostbyaddr() took %d seconds and returns %s", (int)time, he?"success":
          (l_errno == HOST_NOT_FOUND)?"HOST_NOT_FOUND":
          (l_errno == TRY_AGAIN)?"TRY_AGAIN":
          (l_errno == NO_RECOVERY)?"NO_RECOVERY":
          (l_errno == NO_DATA)?"NO_DATA":
          (l_errno == NO_ADDRESS)?"NO_ADDRESS":"<unknown error>"));
   }
   if (system_error_retval != NULL) {
      *system_error_retval = l_errno;
   }

   DEXIT;
   return he;
}
Example #14
0
/****** gdi/sge/sge_qexecve() ************************************************
*  NAME
*     sge_qexecve() -- start a task in a tightly integrated par. job
*
*  SYNOPSIS
*     sge_tid_t sge_qexecve(const char *hostname, const char *queuename,
*                           const char *cwd, const lList *environment
*                           const lList *path_aliases)
*
*  FUNCTION
*     Starts a task in a tightly integrated job.
*     Builds a job object describing the task,
*     connects to the commd on the targeted execution host,
*     deliveres the job object and waits for an answer.
*     The answer from the execution daemon on the execution host
*     contains a task id that is returned to the caller of the function.
*
*  INPUTS
*     const char *hostname - name of the host on which to start the task
*     const lList *environment  - list containing environment variable
*                            settings for the task that override the
*                            default environment
*     const lList *path_aliases - optional a path alias list
*
*  RESULT
*     sge_tid_t - the task id, if the task can be executed,
*                 a value <= 0 indicates an error.
*
*  NOTES
*     MT-NOTE: sge_qexecve() is not MT safe
******************************************************************************/
sge_tid_t sge_qexecve(sge_gdi_ctx_class_t *ctx,
                      const char *hostname, const char *queuename,
                      const char *cwd, const lList *environment,
                      const lList *path_aliases)
{
    char myname[256];
    const char *s;
    int ret, uid;
    sge_tid_t tid = NULL;
    lListElem *petrep;
    lListElem *rt;
    sge_pack_buffer pb;
    u_long32 jobid, jataskid;
    u_long32 dummymid = 0;
    const char *env_var_name = "SGE_TASK_ID";

    DENTER(TOP_LAYER, "sge_qexecve");

    if (hostname == NULL) {
        sprintf(lasterror, MSG_GDI_INVALIDPARAMETER_SS, "sge_qexecve", "hostname");
        DRETURN(NULL);
    }

    /* resolve user */
    if (sge_uid2user((uid=getuid()), myname, sizeof(myname)-1, MAX_NIS_RETRIES)) {
        sprintf(lasterror, MSG_GDI_RESOLVINGUIDTOUSERNAMEFAILED_IS ,
                uid, strerror(errno));
        DRETURN(NULL);
    }

    if ((s=getenv("JOB_ID")) == NULL) {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, "JOB_ID");
        DRETURN(NULL);
    }

    if (sscanf(s, sge_u32, &jobid) != 1) {
        sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, "JOB_ID");
        DRETURN(NULL);
    }

    if ((s=getenv(env_var_name)) != NULL) {
        if (strcmp(s, "undefined") == 0) {
            jataskid = 1;
        } else {
            if (sscanf(s, sge_u32, &jataskid) != 1) {
                sprintf(lasterror, MSG_GDI_STRINGISINVALID_SS, s, env_var_name);
                DRETURN(NULL);
            }
        }
    } else {
        sprintf(lasterror, MSG_GDI_MISSINGINENVIRONMENT_S, env_var_name);
        DRETURN(NULL);
    }

    /* ---- build up pe task request structure (see gdilib/sge_petaskL.h) */
    petrep = lCreateElem(PETR_Type);

    lSetUlong(petrep, PETR_jobid, jobid);
    lSetUlong(petrep, PETR_jataskid, jataskid);
    lSetString(petrep, PETR_owner, myname);
    lSetUlong(petrep, PETR_submission_time, sge_get_gmt());

    if (cwd != NULL) {
        lSetString(petrep, PETR_cwd, cwd);
    }

    if (environment != NULL) {
        lSetList(petrep, PETR_environment, lCopyList("environment", environment));
    }

    if (path_aliases != NULL) {
        lSetList(petrep, PETR_path_aliases, lCopyList("path_aliases", path_aliases));
    }


    if (queuename != NULL) {
        lSetString(petrep, PETR_queuename, queuename);
    }

    if (init_packbuffer(&pb, 1024, 0) != PACK_SUCCESS) {
        lFreeElem(&petrep);
        sprintf(lasterror, SFNMAX, MSG_GDI_OUTOFMEMORY);
        DRETURN(NULL);
    }

    pack_job_delivery(&pb, petrep);

    ret = gdi2_send_message_pb(ctx, 1, prognames[EXECD], 1, hostname,
                               TAG_JOB_EXECUTION, &pb, &dummymid);

    clear_packbuffer(&pb);

    lFreeElem(&petrep);

    if (ret != CL_RETVAL_OK) {
        sprintf(lasterror, MSG_GDI_SENDTASKTOEXECDFAILED_SS, hostname, cl_get_error_text(ret));
        DRETURN(NULL);
    }

    /* add list into our remote task list */
    rt = lAddElemStr(&remote_task_list, RT_tid, "none", RT_Type);
    lSetHost(rt, RT_hostname, hostname);
    lSetUlong(rt, RT_state, RT_STATE_WAIT4ACK);

    rcv_from_execd(ctx, OPT_SYNCHRON, TAG_JOB_EXECUTION);

    tid = (sge_tid_t) lGetString(rt, RT_tid);

    if (strcmp(tid, "none") == 0) {
        tid = NULL;
        sprintf(lasterror, MSG_GDI_EXECDONHOSTDIDNTACCEPTTASK_S, hostname);
    }

    /* now close message to execd */
    cl_commlib_shutdown_handle(cl_com_get_handle("execd_handle", 0), false);

    DRETURN(tid);
}
Example #15
0
/************************************************************************
 Master routine for job exit

 We need a rusage struct filled.
 In normal cases this is done by the execd, sending this structure
 to notify master about job finish.

 In case of an error noticed by the master which needs the job to be 
 removed we can fill this structure by hand. We need:

 rusage->job_number
 rusage->qname to clean up the queue (if we didn't find it we nevertheless
               clean up the job

 for functions regarding rusage see sge_rusage.c
 ************************************************************************/
void sge_job_exit(sge_gdi_ctx_class_t *ctx, lListElem *jr, lListElem *jep, lListElem *jatep, monitoring_t *monitor) 
{
   lListElem *queueep = NULL;
   const char *err_str = NULL;
   const char *qname = NULL;  
   const char *hostname = MSG_OBJ_UNKNOWNHOST;
   u_long32 jobid, jataskid;
   lListElem *hep = NULL;
   u_long32 timestamp;
   object_description *object_base = object_type_get_object_description();

   u_long32 failed, general_failure;
   lList *saved_gdil;

   DENTER(TOP_LAYER, "sge_job_exit");

   /* JG: TODO: we'd prefer some more precise timestamp, e.g. from jr */
   timestamp = sge_get_gmt();
                     
   qname = lGetString(jr, JR_queue_name);
   if (qname == NULL) {
      qname = (char *)MSG_OBJ_UNKNOWNQ;
   }

   err_str = lGetString(jr, JR_err_str);
   if (err_str == NULL) {
      err_str = MSG_UNKNOWNREASON;
   }

   jobid = lGetUlong(jr, JR_job_number);
   jataskid = lGetUlong(jr, JR_ja_task_number);
   failed = lGetUlong(jr, JR_failed);
   general_failure = lGetUlong(jr, JR_general_failure);

   cancel_job_resend(jobid, jataskid);

   /* This only has a meaning for Hibernator jobs. The job pid must
    * be saved accross restarts, since jobs get their old pid
    */
   lSetUlong(jatep, JAT_pvm_ckpt_pid, lGetUlong(jr, JR_job_pid));

   DPRINTF(("reaping job "sge_u32"."sge_u32" in queue >%s< job_pid %d\n", 
      jobid, jataskid, qname, (int) lGetUlong(jatep, JAT_pvm_ckpt_pid)));

   queueep = cqueue_list_locate_qinstance(*object_base[SGE_TYPE_CQUEUE].list, qname);
   if (queueep == NULL) {
      ERROR((SGE_EVENT, MSG_JOB_WRITEJFINISH_S, qname));
   }

   sge_job_remove_enforce_limit_trigger(jobid, jataskid);

   /* retrieve hostname for later use */
   if (queueep != NULL) {
      hostname = lGetHost(queueep, QU_qhostname);
   }

   if (failed) {        /* a problem occured */
      WARNING((SGE_EVENT, MSG_JOB_FAILEDONHOST_UUSSSS, sge_u32c(jobid), 
               sge_u32c(jataskid), 
               hostname,
               general_failure ? MSG_GENERAL : "",
               get_sstate_description(failed), err_str));
   } else {
      INFO((SGE_EVENT, MSG_JOB_JFINISH_UUS,  sge_u32c(jobid), sge_u32c(jataskid), hostname));
   }

   /*-------------------------------------------------*/

   /* test if this job is in state JRUNNING or JTRANSFERING */
   if (lGetUlong(jatep, JAT_status) != JRUNNING && 
       lGetUlong(jatep, JAT_status) != JTRANSFERING) {
      ERROR((SGE_EVENT, MSG_JOB_JEXITNOTRUN_UU, sge_u32c(lGetUlong(jep, JB_job_number)), sge_u32c(jataskid)));
      DRETURN_VOID;
   }

   saved_gdil = lCopyList("cpy", lGetList(jatep, JAT_granted_destin_identifier_list));

   /*
    * case 1: job being trashed because 
    *    --> failed starting interactive job
    *    --> job was deleted
    *    --> a failed batch job that explicitely shall not enter error state
    */
   if (((lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) ||
         (failed && !lGetString(jep, JB_exec_file)) ||
         (failed && general_failure==GFSTATE_JOB && JOB_TYPE_IS_NO_ERROR(lGetUlong(jep, JB_type)))) {
      reporting_create_acct_record(ctx, NULL, jr, jep, jatep, false);
      /* JG: TODO: we need more information in the log message */
      reporting_create_job_log(NULL, timestamp, JL_DELETED, MSG_EXECD, hostname, jr, jep, jatep, NULL, MSG_LOG_JREMOVED);

      sge_commit_job(ctx, jep, jatep, jr, COMMIT_ST_FINISHED_FAILED_EE, COMMIT_DEFAULT | COMMIT_NEVER_RAN, monitor);

      if (lGetUlong(jep, JB_ar) != 0 && (lGetUlong(jatep, JAT_state) & JDELETED) == JDELETED) {
         /* get AR and remove it if no other jobs are debited */
         lList *master_ar_list = *object_base[SGE_TYPE_AR].list;
         lListElem *ar = ar_list_locate(master_ar_list, lGetUlong(jep, JB_ar));

         if (ar != NULL && lGetUlong(ar, AR_state) == AR_DELETED) {
            lListElem *ar_queue;
            u_long32 ar_id = lGetUlong(ar, AR_id);

            for_each(ar_queue, lGetList(ar, AR_reserved_queues)) {
               if (qinstance_slots_used(ar_queue) != 0) {
                  break;
               }
            }
            if (ar_queue == NULL) {
               /* no jobs registered in advance reservation */
               dstring buffer = DSTRING_INIT;

               sge_dstring_sprintf(&buffer, sge_U32CFormat,
                                   sge_u32c(ar_id));

               ar_do_reservation(ar, false);

               reporting_create_ar_log_record(NULL, ar, ARL_DELETED, 
                                              "AR deleted",
                                              timestamp);
               reporting_create_ar_acct_records(NULL, ar, timestamp); 

               lRemoveElem(master_ar_list, &ar);

               sge_event_spool(ctx, NULL, 0, sgeE_AR_DEL, 
                      ar_id, 0, sge_dstring_get_string(&buffer), NULL, NULL,
                      NULL, NULL, NULL, true, true);
               sge_dstring_free(&buffer);
            }
         }
Example #16
0
static void qevent_testsuite_mode(sge_evc_class_t *evc) 
{
#ifndef QEVENT_SHOW_ALL
   u_long32 timestamp;
   lCondition *where =NULL;
   lEnumeration *what = NULL;
 
   const int job_nm[] = {       
         JB_job_number,
         JB_host,
         JB_category,            
         JB_project, 
         JB_ja_tasks,
         JB_ja_structure,
         JB_ja_n_h_ids,
         JB_ja_u_h_ids,
         JB_ja_s_h_ids,
         JB_ja_o_h_ids,   
         JB_ja_template,
         NoName
      };

   const int jat_nm[] = {     
      JAT_status, 
      JAT_task_number,
      NoName
   };  
#endif
   
   DENTER(TOP_LAYER, "qevent_testsuite_mode");

   sge_mirror_initialize(evc, EV_ID_ANY, "qevent", true,
                         NULL, NULL, NULL, NULL, NULL);

#ifdef QEVENT_SHOW_ALL
   sge_mirror_subscribe(evc, SGE_TYPE_ALL, print_event, NULL, NULL, NULL, NULL);
#else /* QEVENT_SHOW_ALL */
   where = NULL; 
   what =  lIntVector2What(JB_Type, job_nm); 
   sge_mirror_subscribe(evc, SGE_TYPE_JOB, print_jatask_event, NULL, NULL, where, what);
   lFreeWhere(&where);
   lFreeWhat(&what);
   
   where = NULL; 
   what = lIntVector2What(JAT_Type, jat_nm); 
   sge_mirror_subscribe(evc, SGE_TYPE_JATASK, print_jatask_event, NULL, NULL, where, what);
   lFreeWhere(&where);
   lFreeWhat(&what);
 
   /* we want a 5 second event delivery interval */
   evc->ec_set_edtime(evc, 5);

   /* and have our events flushed immediately */
   evc->ec_set_flush(evc, sgeE_JATASK_MOD, true, 1);
   evc->ec_set_flush(evc, sgeE_JOB_FINAL_USAGE, true, 1);
   evc->ec_set_flush(evc, sgeE_JOB_ADD, true, 1);
   evc->ec_set_flush(evc, sgeE_JOB_DEL, true, 1);

#endif /* QEVENT_SHOW_ALL */
   
   while (!shut_me_down) {
      sge_mirror_error error = sge_mirror_process_events(evc);
      if (error == SGE_EM_TIMEOUT && !shut_me_down) {
         sleep(10);
         continue;
      }

#ifndef QEVENT_SHOW_ALL
      timestamp = sge_get_gmt();
      fprintf(stdout,"ECL_STATE (jobs_running=%ld:jobs_registered=%ld:ECL_TIME="sge_U32CFormat")\n",
              Global_jobs_running,Global_jobs_registered,sge_u32c(timestamp));
      fflush(stdout);  
#endif
   }

   sge_mirror_shutdown(evc);

   DEXIT;
}
Example #17
0
/*-------------------------------------------------------------------------*/
int main(int argc, char **argv)
{
   int ret;
   int my_pid;
   int ret_val;
   int printed_points = 0;
   int max_enroll_tries;
   static char tmp_err_file_name[SGE_PATH_MAX];
   time_t next_prof_output = 0;
   int execd_exit_state = 0;
   lList **master_job_list = NULL;
   sge_gdi_ctx_class_t *ctx = NULL;
   lList *alp = NULL;

   DENTER_MAIN(TOP_LAYER, "execd");

#if defined(LINUX)
   gen_procList ();
#endif

   prof_mt_init();

   set_thread_name(pthread_self(),"Execd Thread");

   prof_set_level_name(SGE_PROF_CUSTOM1, "Execd Thread", NULL); 
   prof_set_level_name(SGE_PROF_CUSTOM2, "Execd Dispatch", NULL); 

#ifdef __SGE_COMPILE_WITH_GETTEXT__  
   /* init language output for gettext() , it will use the right language */
   sge_init_language_func((gettext_func_type)        gettext,
                         (setlocale_func_type)      setlocale,
                         (bindtextdomain_func_type) bindtextdomain,
                         (textdomain_func_type)     textdomain);
   sge_init_language(NULL,NULL);   
#endif /* __SGE_COMPILE_WITH_GETTEXT__  */

   /* This needs a better solution */
   umask(022);
      
   /* Initialize path for temporary logging until we chdir to spool */
   my_pid = getpid();
   sprintf(tmp_err_file_name,"%s."sge_U32CFormat"", TMP_ERR_FILE_EXECD, sge_u32c(my_pid));
   log_state_set_log_file(tmp_err_file_name);

   /* exit func for SGE_EXIT() */
   sge_sig_handler_in_main_loop = 0;
   sge_setup_sig_handlers(EXECD);

   if (sge_setup2(&ctx, EXECD, MAIN_THREAD, &alp, false) != AE_OK) {
      answer_list_output(&alp);
      SGE_EXIT((void**)&ctx, 1);
   }
   ctx->set_exit_func(ctx, execd_exit_func);
   
#if defined(SOLARIS)
   /* Init shared SMF libs if necessary */
   if (sge_smf_used() == 1 && sge_smf_init_libs() != 0) {
       SGE_EXIT((void**)&ctx, 1);
   }
#endif

   /* prepare daemonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_prepare(ctx);
   }

   if ((ret=sge_occupy_first_three())>=0) {
      CRITICAL((SGE_EVENT, MSG_FILE_REDIRECTFD_I, ret));
      SGE_EXIT((void**)&ctx, 1);
   }

   lInit(nmv);

   /* unset XAUTHORITY if set */
   if (getenv("XAUTHORITY") != NULL) {
      sge_unsetenv("XAUTHORITY");
   }

   parse_cmdline_execd(argv);   
   
   /* exit if we can't get communication handle (bind port) */
   max_enroll_tries = 30;
   while (cl_com_get_handle(prognames[EXECD],1) == NULL) {
      ctx->prepare_enroll(ctx);
      max_enroll_tries--;

      if (max_enroll_tries <= 0 || shut_me_down) {
         /* exit after 30 seconds */
         if (printed_points != 0) {
            printf("\n");
         }
         CRITICAL((SGE_EVENT, MSG_COM_ERROR));
         SGE_EXIT((void**)&ctx, 1);
      }
      if (cl_com_get_handle(prognames[EXECD],1) == NULL) {
        /* sleep when prepare_enroll() failed */
        sleep(1);
        if (max_enroll_tries < 27) {
           printf(".");
           printed_points++;
           fflush(stdout);
        }
      }
   }

   if (printed_points != 0) {
      printf("\n");
   }

   /*
    * now the commlib up and running. Set execd application status function 
    * ( commlib callback function for qping status information response 
    *   messages (SIRM) )
    */
   ret_val = cl_com_set_status_func(sge_execd_application_status);
   if (ret_val != CL_RETVAL_OK) {
      ERROR((SGE_EVENT, cl_get_error_text(ret_val)) );
   }

   /* test connection */
   {
      cl_com_SIRM_t* status = NULL;
      ret_val = cl_commlib_get_endpoint_status(ctx->get_com_handle(ctx),
                                               (char *)ctx->get_master(ctx, true),
                                               (char*)prognames[QMASTER], 1, &status);
      if (ret_val != CL_RETVAL_OK) {
         ERROR((SGE_EVENT, cl_get_error_text(ret_val)));
         ERROR((SGE_EVENT, MSG_CONF_NOCONFBG));
      }
      cl_com_free_sirm_message(&status);
   }
   
   /* finalize daeamonize */
   if (!getenv("SGE_ND")) {
      sge_daemonize_finalize(ctx);
   }

   /* daemonizes if qmaster is unreachable */   
   sge_setup_sge_execd(ctx, tmp_err_file_name);

   /* are we using qidle or not */
   sge_ls_qidle(mconf_get_use_qidle());
   sge_ls_gnu_ls(1);
   
   DPRINTF(("use_qidle: %d\n", mconf_get_use_qidle()));

   /* test load sensor (internal or external) */
   {
      lList *report_list = sge_build_load_report(ctx->get_qualified_hostname(ctx), ctx->get_binary_path(ctx));
      lFreeList(&report_list);
   }
   
   /* here we have to wait for qmaster registration */
   while (sge_execd_register_at_qmaster(ctx, false) != 0) {
      if (sge_get_com_error_flag(EXECD, SGE_COM_ACCESS_DENIED, true)) {
         /* This is no error */
         DPRINTF(("*****  got SGE_COM_ACCESS_DENIED from qmaster  *****\n"));
      }
      if (sge_get_com_error_flag(EXECD, SGE_COM_ENDPOINT_NOT_UNIQUE, false)) {
         execd_exit_state = SGE_COM_ENDPOINT_NOT_UNIQUE;
         break;
      }
      if (shut_me_down != 0) {
         break;
      }
      sleep(30);
   }

   /* 
    * Terminate on SIGTERM or hard communication error
    */
   if (execd_exit_state != 0 || shut_me_down != 0) {
      sge_shutdown((void**)&ctx, execd_exit_state);
      DRETURN(execd_exit_state);
   }

   /*
    * We write pid file when we are connected to qmaster. Otherwise an old
    * execd might overwrite our pidfile.
    */
   sge_write_pid(EXECD_PID_FILE);

   /*
    * At this point we are sure we are the only sge_execd and we are connected
    * to the current qmaster. First we have to report any reaped children
    * that might exist.
    */
   starting_up();

   /*
    * Log a warning message if execd hasn't been started by a superuser
    */
   if (!sge_is_start_user_superuser()) {
      WARNING((SGE_EVENT, MSG_SWITCH_USER_NOT_ROOT));
   }   

#ifdef COMPILE_DC
   if (ptf_init()) {
      CRITICAL((SGE_EVENT, MSG_EXECD_NOSTARTPTF));
      SGE_EXIT((void**)&ctx, 1);
   }
   INFO((SGE_EVENT, MSG_EXECD_STARTPDCANDPTF));
#endif

   master_job_list = object_type_get_master_list(SGE_TYPE_JOB);
   *master_job_list = lCreateList("Master_Job_List", JB_Type);
   job_list_read_from_disk(master_job_list, "Master_Job_List",
                           0, SPOOL_WITHIN_EXECD, 
                          job_initialize_job);
   

   /* clean up jobs hanging around (look in active_dir) */
   clean_up_old_jobs(ctx, 1);
   execd_trash_load_report();
   sge_set_flush_lr_flag(true);

   sge_sig_handler_in_main_loop = 1;

   if (thread_prof_active_by_id(pthread_self())) {
      prof_start(SGE_PROF_CUSTOM1, NULL);
      prof_start(SGE_PROF_CUSTOM2, NULL);
      prof_start(SGE_PROF_GDI_REQUEST, NULL);
   } else {
      prof_stop(SGE_PROF_CUSTOM1, NULL);
      prof_stop(SGE_PROF_CUSTOM2, NULL);
      prof_stop(SGE_PROF_GDI_REQUEST, NULL);
   }

   PROF_START_MEASUREMENT(SGE_PROF_CUSTOM1);

   /* Start dispatching */
   execd_exit_state = sge_execd_process_messages(ctx);


   /*
    * This code is only reached when dispatcher terminates and execd goes down.
    */

   /* log if we received SIGPIPE signal */
   if (sge_sig_handler_sigpipe_received) {
       sge_sig_handler_sigpipe_received = 0;
       INFO((SGE_EVENT, "SIGPIPE received\n"));
   }

#if defined(LINUX)
   free_procList();
#endif
   lFreeList(master_job_list);

   PROF_STOP_MEASUREMENT(SGE_PROF_CUSTOM1);
   if (prof_is_active(SGE_PROF_ALL)) {
     time_t now = (time_t)sge_get_gmt();

      if (now > next_prof_output) {
         prof_output_info(SGE_PROF_ALL, false, "profiling summary:\n");
         prof_reset(SGE_PROF_ALL,NULL);
         next_prof_output = now + 60;
      }
   }
   sge_prof_cleanup();

   sge_shutdown((void**)&ctx, execd_exit_state);
   DRETURN(execd_exit_state);
}